diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index a5fb3901428d9d..765372aa9e402f 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1412,13 +1412,14 @@ class MCPlusBuilder {
     return false;
   }
 
-  /// Modify a direct call instruction \p Inst with an indirect call taking
-  /// a destination from a memory location pointed by \p TargetLocation symbol.
-  virtual bool convertCallToIndirectCall(MCInst &Inst,
-                                         const MCSymbol *TargetLocation,
-                                         MCContext *Ctx) {
+  /// Creates an indirect call to the function within the \p DirectCall PLT
+  /// stub. The function's memory location is pointed to by the
+  /// \p TargetLocation symbol.
+  virtual InstructionListType
+  createIndirectPltCall(const MCInst &DirectCall,
+                        const MCSymbol *TargetLocation, MCContext *Ctx) {
     llvm_unreachable("not implemented");
-    return false;
+    return {};
   }
 
   /// Morph an indirect call into a load where \p Reg holds the call target.
diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h
index 3cc9d823c815b2..4559ff5ff51592 100644
--- a/bolt/include/bolt/Rewrite/DWARFRewriter.h
+++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h
@@ -150,9 +150,6 @@ class DWARFRewriter {
   /// blocks) to be updated.
   void updateDebugAddressRanges();
 
-  /// Rewrite .gdb_index section if present.
-  void updateGdbIndexSection(CUOffsetMap &CUMap, uint32_t NumCUs);
-
   /// DWARFDie contains a pointer to a DIE and hence gets invalidated once the
   /// embedded DIE is destroyed. This wrapper class stores a DIE internally and
   /// could be cast to a DWARFDie that is valid even after the initial DIE is
@@ -194,14 +191,6 @@ class DWARFRewriter {
     DwoRangesBase[DWOId] = RangesBase;
   }
 
-  /// Adds an GDBIndexTUEntry if .gdb_index seciton exists.
-  void addGDBTypeUnitEntry(const GDBIndexTUEntry &&Entry);
-
-  /// Returns all entries needed for Types CU list
-  const GDBIndexTUEntryType &getGDBIndexTUEntryVector() const {
-    return GDBIndexTUEntryVector;
-  }
-
   using OverriddenSectionsMap = std::unordered_map<DWARFSectionKind, StringRef>;
 
   /// Output .dwo files.
   void writeDWOFiles(DWARFUnit &, const OverriddenSectionsMap &,
diff --git a/bolt/lib/Passes/PLTCall.cpp b/bolt/lib/Passes/PLTCall.cpp
index d0276f22e14ef8..2ed996fadbb99e 100644
--- a/bolt/lib/Passes/PLTCall.cpp
+++ b/bolt/lib/Passes/PLTCall.cpp
@@ -48,8 +48,8 @@ Error PLTCall::runOnFunctions(BinaryContext &BC) {
     return Error::success();
 
   uint64_t NumCallsOptimized = 0;
-  for (auto &It : BC.getBinaryFunctions()) {
-    BinaryFunction &Function = It.second;
+  for (auto &BFI : BC.getBinaryFunctions()) {
+    BinaryFunction &Function = BFI.second;
     if (!shouldOptimize(Function))
       continue;
@@ -61,18 +61,21 @@ Error PLTCall::runOnFunctions(BinaryContext &BC) {
       if (opts::PLT == OT_HOT && !BB.getKnownExecutionCount())
         continue;
 
-      for (MCInst &Instr : BB) {
-        if (!BC.MIB->isCall(Instr))
+      for (auto II = BB.begin(); II != BB.end(); II++) {
+        if (!BC.MIB->isCall(*II))
           continue;
-        const MCSymbol *CallSymbol = BC.MIB->getTargetSymbol(Instr);
+        const MCSymbol *CallSymbol = BC.MIB->getTargetSymbol(*II);
         if (!CallSymbol)
           continue;
         const BinaryFunction *CalleeBF = BC.getFunctionForSymbol(CallSymbol);
         if (!CalleeBF || !CalleeBF->isPLTFunction())
           continue;
-        BC.MIB->convertCallToIndirectCall(Instr, CalleeBF->getPLTSymbol(),
-                                          BC.Ctx.get());
-        BC.MIB->addAnnotation(Instr, "PLTCall", true);
+        const InstructionListType NewCode = BC.MIB->createIndirectPltCall(
+            *II, CalleeBF->getPLTSymbol(), BC.Ctx.get());
+        II = BB.replaceInstruction(II, NewCode);
+        assert(!NewCode.empty() && "PLT Call replacement must be non-empty");
+        std::advance(II, NewCode.size() - 1);
+        BC.MIB->addAnnotation(*II, "PLTCall", true);
         ++NumCallsOptimized;
       }
     }
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index e1b3762a316606..0e475031eae4f3 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -2060,177 +2060,6 @@ void DWARFRewriter::writeDWOFiles(
   TempOut->keep();
 }
 
-void DWARFRewriter::addGDBTypeUnitEntry(const GDBIndexTUEntry &&Entry) {
-  std::lock_guard<std::mutex> Lock(DWARFRewriterMutex);
-  if (!BC.getGdbIndexSection())
-    return;
-  GDBIndexTUEntryVector.emplace_back(Entry);
-}
-
-void DWARFRewriter::updateGdbIndexSection(CUOffsetMap &CUMap,
-                                          uint32_t NumCUs) {
-  if (!BC.getGdbIndexSection())
-    return;
-
-  // See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html
-  // for .gdb_index section format.
-
-  StringRef GdbIndexContents = BC.getGdbIndexSection()->getContents();
-
-  const char *Data = GdbIndexContents.data();
-
-  // Parse the header.
-  const uint32_t Version = read32le(Data);
-  if (Version != 7 && Version != 8) {
-    errs() << "BOLT-ERROR: can only process .gdb_index versions 7 and 8\n";
-    exit(1);
-  }
-
-  // Some .gdb_index generators use file offsets while others use section
-  // offsets. Hence we can only rely on offsets relative to each other,
-  // and ignore their absolute values.
-  const uint32_t CUListOffset = read32le(Data + 4);
-  const uint32_t CUTypesOffset = read32le(Data + 8);
-  const uint32_t AddressTableOffset = read32le(Data + 12);
-  const uint32_t SymbolTableOffset = read32le(Data + 16);
-  const uint32_t ConstantPoolOffset = read32le(Data + 20);
-  Data += 24;
-
-  // Map CUs offsets to indices and verify existing index table.
-  std::map<uint32_t, uint32_t> OffsetToIndexMap;
-  const uint32_t CUListSize = CUTypesOffset - CUListOffset;
-  const uint32_t TUListSize = AddressTableOffset - CUTypesOffset;
-  const unsigned NUmCUsEncoded = CUListSize / 16;
-  unsigned MaxDWARFVersion = BC.DwCtx->getMaxVersion();
-  unsigned NumDWARF5TUs =
-      getGDBIndexTUEntryVector().size() - BC.DwCtx->getNumTypeUnits();
-  bool SkipTypeUnits = false;
-  // For DWARF5 Types are in .debug_info.
-  // LLD doesn't generate Types CU List, and in CU list offset
-  // only includes CUs.
-  // GDB 11+ includes only CUs in CU list and generates Types
-  // list.
-  // GDB 9 includes CUs and TUs in CU list and generates TYpes
-  // list. The NumCUs is CUs + TUs, so need to modify the check.
-  // For split-dwarf
-  // GDB-11, DWARF5: TU units from dwo are not included.
-  // GDB-11, DWARF4: TU units from dwo are included.
-  if (MaxDWARFVersion >= 5)
-    SkipTypeUnits = !TUListSize ? true
-                                : ((NUmCUsEncoded + NumDWARF5TUs) ==
-                                   BC.DwCtx->getNumCompileUnits());
-
-  if (!((CUListSize == NumCUs * 16) ||
-        (CUListSize == (NumCUs + NumDWARF5TUs) * 16))) {
-    errs() << "BOLT-ERROR: .gdb_index: CU count mismatch\n";
-    exit(1);
-  }
-  DenseSet<uint64_t> OriginalOffsets;
-  for (unsigned Index = 0, Units = BC.DwCtx->getNumCompileUnits();
-       Index < Units; ++Index) {
-    const DWARFUnit *CU = BC.DwCtx->getUnitAtIndex(Index);
-    if (SkipTypeUnits && CU->isTypeUnit())
-      continue;
-    const uint64_t Offset = read64le(Data);
-    Data += 16;
-    if (CU->getOffset() != Offset) {
-      errs() << "BOLT-ERROR: .gdb_index CU offset mismatch\n";
-      exit(1);
-    }
-
-    OriginalOffsets.insert(Offset);
-    OffsetToIndexMap[Offset] = Index;
-  }
-
-  // Ignore old address table.
-  const uint32_t OldAddressTableSize = SymbolTableOffset - AddressTableOffset;
-  // Move Data to the beginning of symbol table.
-  Data += SymbolTableOffset - CUTypesOffset;
-
-  // Calculate the size of the new address table.
-  uint32_t NewAddressTableSize = 0;
-  for (const auto &CURangesPair : ARangesSectionWriter->getCUAddressRanges()) {
-    const SmallVector<DebugAddressRange, 2> &Ranges = CURangesPair.second;
-    NewAddressTableSize += Ranges.size() * 20;
-  }
-
-  // Difference between old and new table (and section) sizes.
-  // Could be negative.
-  int32_t Delta = NewAddressTableSize - OldAddressTableSize;
-
-  size_t NewGdbIndexSize = GdbIndexContents.size() + Delta;
-
-  // Free'd by ExecutableFileMemoryManager.
-  auto *NewGdbIndexContents = new uint8_t[NewGdbIndexSize];
-  uint8_t *Buffer = NewGdbIndexContents;
-
-  write32le(Buffer, Version);
-  write32le(Buffer + 4, CUListOffset);
-  write32le(Buffer + 8, CUTypesOffset);
-  write32le(Buffer + 12, AddressTableOffset);
-  write32le(Buffer + 16, SymbolTableOffset + Delta);
-  write32le(Buffer + 20, ConstantPoolOffset + Delta);
-  Buffer += 24;
-
-  using MapEntry = std::pair<uint32_t, CUInfo>;
-  std::vector<MapEntry> CUVector(CUMap.begin(), CUMap.end());
-  // Need to sort since we write out all of TUs in .debug_info before CUs.
-  std::sort(CUVector.begin(), CUVector.end(),
-            [](const MapEntry &E1, const MapEntry &E2) -> bool {
-              return E1.second.Offset < E2.second.Offset;
-            });
-  // Writing out CU List
-  for (auto &CUInfo : CUVector) {
-    // Skipping TU for DWARF5 when they are not included in CU list.
-    if (!OriginalOffsets.count(CUInfo.first))
-      continue;
-    write64le(Buffer, CUInfo.second.Offset);
-    // Length encoded in CU doesn't contain first 4 bytes that encode length.
-    write64le(Buffer + 8, CUInfo.second.Length + 4);
-    Buffer += 16;
-  }
-
-  // Rewrite TU CU List, since abbrevs can be different.
-  // Entry example:
-  // 0: offset = 0x00000000, type_offset = 0x0000001e, type_signature =
-  // 0x418503b8111e9a7b Spec says " triplet, the first value is the CU offset,
-  // the second value is the type offset in the CU, and the third value is the
-  // type signature" Looking at what is being generated by gdb-add-index. The
-  // first entry is TU offset, second entry is offset from it, and third entry
-  // is the type signature.
-  if (TUListSize)
-    for (const GDBIndexTUEntry &Entry : getGDBIndexTUEntryVector()) {
-      write64le(Buffer, Entry.UnitOffset);
-      write64le(Buffer + 8, Entry.TypeDIERelativeOffset);
-      write64le(Buffer + 16, Entry.TypeHash);
-      Buffer += sizeof(GDBIndexTUEntry);
-    }
-
-  // Generate new address table.
-  for (const std::pair<const uint64_t, DebugAddressRangesVector> &CURangesPair :
-       ARangesSectionWriter->getCUAddressRanges()) {
-    const uint32_t CUIndex = OffsetToIndexMap[CURangesPair.first];
-    const DebugAddressRangesVector &Ranges = CURangesPair.second;
-    for (const DebugAddressRange &Range : Ranges) {
-      write64le(Buffer, Range.LowPC);
-      write64le(Buffer + 8, Range.HighPC);
-      write32le(Buffer + 16, CUIndex);
-      Buffer += 20;
-    }
-  }
-
-  const size_t TrailingSize =
-      GdbIndexContents.data() + GdbIndexContents.size() - Data;
-  assert(Buffer + TrailingSize == NewGdbIndexContents + NewGdbIndexSize &&
-         "size calculation error");
-
-  // Copy over the rest of the original data.
-  memcpy(Buffer, Data, TrailingSize);
-
-  // Register the new section.
-  BC.registerOrUpdateNoteSection(".gdb_index", NewGdbIndexContents,
-                                 NewGdbIndexSize);
-}
-
 std::unique_ptr<DebugBufferVector>
 DWARFRewriter::makeFinalLocListsSection(DWARFVersion Version) {
   auto LocBuffer = std::make_unique<DebugBufferVector>();
diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index b2c8b2446f7e1e..6b3f5bce9f0f58 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -273,6 +273,8 @@ class LinuxKernelRewriter final : public MetadataRewriter {
   /// Handle alternative instruction info from .altinstructions.
   Error readAltInstructions();
+  Error tryReadAltInstructions(uint32_t AltInstFeatureSize,
+                               bool AltInstHasPadLen, bool ParseOnly);
   Error rewriteAltInstructions();
 
   /// Read .pci_fixup
@@ -1319,12 +1321,69 @@ Error LinuxKernelRewriter::rewriteBugTable() {
 ///   u8 padlen;          // present in older kernels
 /// } __packed;
 ///
-/// Note the structures is packed.
+/// Note that the structure is packed.
+///
+/// Since the size of the "feature" field could be either u16 or u32, and
+/// "padlen" presence is unknown, we attempt to parse the .altinstructions
+/// section using all possible combinations (four at this time). Since we
+/// validate the contents of the section and its size, the detection works
+/// quite well. Still, we leave the user the opportunity to specify these
+/// features on the command line and skip the guesswork.
 Error LinuxKernelRewriter::readAltInstructions() {
   AltInstrSection = BC.getUniqueSectionByName(".altinstructions");
   if (!AltInstrSection)
     return Error::success();
 
+  // Presence of "padlen" field.
+  std::vector<bool> PadLenVariants;
+  if (opts::AltInstHasPadLen.getNumOccurrences())
+    PadLenVariants.push_back(opts::AltInstHasPadLen);
+  else
+    PadLenVariants = {false, true};
+
+  // Size (in bytes) variants of "feature" field.
+  std::vector<uint32_t> FeatureSizeVariants;
+  if (opts::AltInstFeatureSize.getNumOccurrences())
+    FeatureSizeVariants.push_back(opts::AltInstFeatureSize);
+  else
+    FeatureSizeVariants = {2, 4};
+
+  for (bool AltInstHasPadLen : PadLenVariants) {
+    for (uint32_t AltInstFeatureSize : FeatureSizeVariants) {
+      LLVM_DEBUG({
+        dbgs() << "BOLT-DEBUG: trying AltInstHasPadLen = " << AltInstHasPadLen
+               << "; AltInstFeatureSize = " << AltInstFeatureSize << ";\n";
+      });
+      if (Error E = tryReadAltInstructions(AltInstFeatureSize, AltInstHasPadLen,
+                                           /*ParseOnly*/ true)) {
+        consumeError(std::move(E));
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Matched .altinstructions format\n");
+
+      if (!opts::AltInstHasPadLen.getNumOccurrences())
+        BC.outs() << "BOLT-INFO: setting --" << opts::AltInstHasPadLen.ArgStr
+                  << '=' << AltInstHasPadLen << '\n';
+
+      if (!opts::AltInstFeatureSize.getNumOccurrences())
+        BC.outs() << "BOLT-INFO: setting --" << opts::AltInstFeatureSize.ArgStr
+                  << '=' << AltInstFeatureSize << '\n';
+
+      return tryReadAltInstructions(AltInstFeatureSize, AltInstHasPadLen,
+                                    /*ParseOnly*/ false);
+    }
+  }
+
+  // We couldn't match the format. Read again to properly propagate the error
+  // to the user.
+  return tryReadAltInstructions(opts::AltInstFeatureSize,
+                                opts::AltInstHasPadLen, /*ParseOnly*/ false);
+}
+
+Error LinuxKernelRewriter::tryReadAltInstructions(uint32_t AltInstFeatureSize,
+                                                  bool AltInstHasPadLen,
+                                                  bool ParseOnly) {
   const uint64_t Address = AltInstrSection->getAddress();
   DataExtractor DE = DataExtractor(AltInstrSection->getContents(),
                                    BC.AsmInfo->isLittleEndian(),
@@ -1336,12 +1395,12 @@ Error LinuxKernelRewriter::readAltInstructions() {
         Address + Cursor.tell() + (int32_t)DE.getU32(Cursor);
     const uint64_t AltInstAddress =
         Address + Cursor.tell() + (int32_t)DE.getU32(Cursor);
-    const uint64_t Feature = DE.getUnsigned(Cursor, opts::AltInstFeatureSize);
+    const uint64_t Feature = DE.getUnsigned(Cursor, AltInstFeatureSize);
     const uint8_t OrgSize = DE.getU8(Cursor);
     const uint8_t AltSize = DE.getU8(Cursor);
 
     // Older kernels may have the padlen field.
-    const uint8_t PadLen = opts::AltInstHasPadLen ? DE.getU8(Cursor) : 0;
+    const uint8_t PadLen = AltInstHasPadLen ? DE.getU8(Cursor) : 0;
 
     if (!Cursor)
       return createStringError(
@@ -1358,7 +1417,7 @@ Error LinuxKernelRewriter::readAltInstructions() {
                 << "\n\tFeature: 0x" << Twine::utohexstr(Feature)
                 << "\n\tOrgSize: " << (int)OrgSize
                 << "\n\tAltSize: " << (int)AltSize << '\n';
-      if (opts::AltInstHasPadLen)
+      if (AltInstHasPadLen)
         BC.outs() << "\tPadLen: " << (int)PadLen << '\n';
     }
 
@@ -1375,7 +1434,7 @@ Error LinuxKernelRewriter::readAltInstructions() {
 
     BinaryFunction *AltBF =
         BC.getBinaryFunctionContainingAddress(AltInstAddress);
-    if (AltBF && BC.shouldEmit(*AltBF)) {
+    if (!ParseOnly && AltBF && BC.shouldEmit(*AltBF)) {
      BC.errs()
          << "BOLT-WARNING: alternative instruction sequence found in function "
          << *AltBF << '\n';
@@ -1397,6 +1456,9 @@ Error LinuxKernelRewriter::readAltInstructions() {
           " referenced by .altinstructions entry %d",
           OrgInstAddress, EntryID);
 
+    if (ParseOnly)
+      continue;
+
     // There could be more than one alternative instruction sequences for the
     // same original instruction. Annotate each alternative separately.
     std::string AnnotationName = "AltInst";
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index a74eda8e4a566e..5220d305b838d5 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -1054,6 +1054,47 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
+  InstructionListType createIndirectPltCall(const MCInst &DirectCall,
+                                            const MCSymbol *TargetLocation,
+                                            MCContext *Ctx) override {
+    const bool IsTailCall = isTailCall(DirectCall);
+    assert((DirectCall.getOpcode() == AArch64::BL ||
+            (DirectCall.getOpcode() == AArch64::B && IsTailCall)) &&
+           "64-bit direct (tail) call instruction expected");
+
+    InstructionListType Code;
+    // Code sequence for indirect plt call:
+    // adrp x16, <symbol>
+    // ldr  x17, [x16, #<offset>]
+    // blr  x17  ; or 'br' for tail calls
+
+    MCInst InstAdrp;
+    InstAdrp.setOpcode(AArch64::ADRP);
+    InstAdrp.addOperand(MCOperand::createReg(AArch64::X16));
+    InstAdrp.addOperand(MCOperand::createImm(0));
+    setOperandToSymbolRef(InstAdrp, /* OpNum */ 1, TargetLocation,
+                          /* Addend */ 0, Ctx, ELF::R_AARCH64_ADR_GOT_PAGE);
+    Code.emplace_back(InstAdrp);
+
+    MCInst InstLoad;
+    InstLoad.setOpcode(AArch64::LDRXui);
+    InstLoad.addOperand(MCOperand::createReg(AArch64::X17));
+    InstLoad.addOperand(MCOperand::createReg(AArch64::X16));
+    InstLoad.addOperand(MCOperand::createImm(0));
+    setOperandToSymbolRef(InstLoad, /* OpNum */ 2, TargetLocation,
+                          /* Addend */ 0, Ctx, ELF::R_AARCH64_LD64_GOT_LO12_NC);
+    Code.emplace_back(InstLoad);
+
+    MCInst InstCall;
+    InstCall.setOpcode(IsTailCall ? AArch64::BR : AArch64::BLR);
+    InstCall.addOperand(MCOperand::createReg(AArch64::X17));
+    if (IsTailCall)
+      setTailCall(InstCall);
+    Code.emplace_back(InstCall);
+
+    return Code;
+  }
+
   bool lowerTailCall(MCInst &Inst) override {
     removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall);
     if (getConditionalTailCall(Inst))
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index e350e701c7b7ba..515c9a94c58cd4 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -1639,11 +1639,16 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
-  bool convertCallToIndirectCall(MCInst &Inst, const MCSymbol *TargetLocation,
-                                 MCContext *Ctx) override {
-    assert((Inst.getOpcode() == X86::CALL64pcrel32 ||
-            (Inst.getOpcode() == X86::JMP_4 && isTailCall(Inst))) &&
+  InstructionListType createIndirectPltCall(const MCInst &DirectCall,
+                                            const MCSymbol *TargetLocation,
+                                            MCContext *Ctx) override {
+    assert((DirectCall.getOpcode() == X86::CALL64pcrel32 ||
+            (DirectCall.getOpcode() == X86::JMP_4 && isTailCall(DirectCall))) &&
            "64-bit direct (tail) call instruction expected");
+
+    InstructionListType Code;
+    // Create a new indirect call by converting the previous direct call.
+    MCInst Inst = DirectCall;
     const auto NewOpcode = (Inst.getOpcode() == X86::CALL64pcrel32) ?
                                X86::CALL64m : X86::JMP32m;
     Inst.setOpcode(NewOpcode);
@@ -1664,7 +1669,8 @@ class X86MCPlusBuilder : public MCPlusBuilder {
 
     Inst.insert(Inst.begin(),
                 MCOperand::createReg(X86::RIP)); // BaseReg
 
-    return true;
+    Code.emplace_back(Inst);
+    return Code;
   }
 
   void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) override {
diff --git a/bolt/test/AArch64/plt-call.test b/bolt/test/AArch64/plt-call.test
new file mode 100644
index 00000000000000..da307d4a6c01e6
--- /dev/null
+++ b/bolt/test/AArch64/plt-call.test
@@ -0,0 +1,15 @@
+// Verify that PLTCall optimization works.
+
+RUN: %clang %cflags %p/../Inputs/plt-tailcall.c \
+RUN:    -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --plt=all --print-plt --print-only=foo | FileCheck %s
+
+// Call to printf
+CHECK: adrp x16, printf@GOT
+CHECK: ldr x17, [x16, :lo12:printf@GOT]
+CHECK: blr x17 # PLTCall: 1
+
+// Call to puts that was tail-call optimized
+CHECK: adrp x16, puts@GOT
+CHECK: ldr x17, [x16, :lo12:puts@GOT]
+CHECK: br x17 # TAILCALL # PLTCall: 1
diff --git a/bolt/test/Inputs/plt-tailcall.c b/bolt/test/Inputs/plt-tailcall.c
new file mode 100644
index 00000000000000..13f6e29c607747
--- /dev/null
+++ b/bolt/test/Inputs/plt-tailcall.c
@@ -0,0 +1,8 @@
+#include "stub.h"
+
+int foo(char *c) {
+  printf("");
+  __attribute__((musttail)) return puts(c);
+}
+
+int main() { return foo("a"); }
diff --git a/bolt/test/X86/linux-alt-instruction.s b/bolt/test/X86/linux-alt-instruction.s
index 2cdf31519682a8..66cd33a711b89b 100644
--- a/bolt/test/X86/linux-alt-instruction.s
+++ b/bolt/test/X86/linux-alt-instruction.s
@@ -12,24 +12,30 @@
 ## Older kernels used to have padlen field in alt_instr. Check compatibility.
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown --defsym PADLEN=1 \
-# RUN:   %s -o %t.o
-# RUN: %clang %cflags -nostdlib %t.o -o %t.exe \
+# RUN:   %s -o %t.padlen.o
+# RUN: %clang %cflags -nostdlib %t.padlen.o -o %t.padlen.exe \
 # RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr,--no-pie
-# RUN: llvm-bolt %t.exe --print-normalized --alt-inst-has-padlen -o %t.out \
+# RUN: llvm-bolt %t.padlen.exe --print-normalized --alt-inst-has-padlen -o %t.padlen.out \
 # RUN:   | FileCheck %s
 
 ## Check with a larger size of "feature" field in alt_instr.
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
-# RUN:   --defsym FEATURE_SIZE_4=1 %s -o %t.o
-# RUN: %clang %cflags -nostdlib %t.o -o %t.exe \
+# RUN:   --defsym FEATURE_SIZE_4=1 %s -o %t.fs4.o
+# RUN: %clang %cflags -nostdlib %t.fs4.o -o %t.fs4.exe \
 # RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr,--no-pie
-# RUN: llvm-bolt %t.exe --print-normalized --alt-inst-feature-size=4 -o %t.out \
+# RUN: llvm-bolt %t.fs4.exe --print-normalized --alt-inst-feature-size=4 -o %t.fs4.out \
 # RUN:   | FileCheck %s
 
 ## Check that out-of-bounds read is handled properly.
-# RUN: not llvm-bolt %t.exe --print-normalized --alt-inst-feature-size=2 -o %t.out
+# RUN: not llvm-bolt %t.fs4.exe --alt-inst-feature-size=2 -o %t.fs4.out
+
+## Check that BOLT automatically detects structure fields in .altinstructions.
+
+# RUN: llvm-bolt %t.exe --print-normalized -o %t.out | FileCheck %s
+# RUN: llvm-bolt %t.padlen.exe --print-normalized -o %t.padlen.out | FileCheck %s
+# RUN: llvm-bolt %t.fs4.exe --print-normalized -o %t.fs4.out | FileCheck %s
 
 # CHECK:      BOLT-INFO: Linux kernel binary detected
 # CHECK:      BOLT-INFO: parsed 2 alternative instruction entries
diff --git a/bolt/test/X86/plt-call.test b/bolt/test/X86/plt-call.test
new file mode 100644
index 00000000000000..e6ae86c179d279
--- /dev/null
+++ b/bolt/test/X86/plt-call.test
@@ -0,0 +1,11 @@
+// Verify that PLTCall optimization works.
+
+RUN: %clang %cflags %p/../Inputs/plt-tailcall.c \
+RUN:    -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --plt=all --print-plt --print-only=foo | FileCheck %s
+
+// Call to printf
+CHECK: callq *printf@GOT(%rip) # PLTCall: 1
+
+// Call to puts that was tail-call optimized
+CHECK: jmpl *puts@GOT(%rip) # TAILCALL # PLTCall: 1
diff --git a/clang-tools-extra/clangd/TidyProvider.cpp b/clang-tools-extra/clangd/TidyProvider.cpp
index b658a80559937c..a4121df30d3dfa 100644
--- a/clang-tools-extra/clangd/TidyProvider.cpp
+++ b/clang-tools-extra/clangd/TidyProvider.cpp
@@ -221,13 +221,7 @@ TidyProvider disableUnusableChecks(llvm::ArrayRef<std::string> ExtraBadChecks) {
       "-hicpp-invalid-access-moved",
       // Check uses dataflow analysis, which might hang/crash unexpectedly on
       // incomplete code.
-      "-bugprone-unchecked-optional-access",
-
-      // ----- Performance problems -----
-
-      // This check runs expensive analysis for each variable.
-      // It has been observed to increase reparse time by 10x.
-      "-misc-const-correctness");
+      "-bugprone-unchecked-optional-access");
 
   size_t Size = BadChecks.size();
   for (const std::string &Str : ExtraBadChecks) {
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 0385ca4b3a0639..820e4cc44a7bcd 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -1494,6 +1494,9 @@ bool ByteCodeExprGen<Emitter>::VisitMemberExpr(const MemberExpr *E) {
       return false;
   }
 
+  if (!isa<FieldDecl>(Member))
+    return this->discard(Base) && this->visitDeclRef(Member, E);
+
   if (Initializing) {
     if (!this->delegate(Base))
       return false;
@@ -1503,19 +1506,16 @@ bool ByteCodeExprGen<Emitter>::VisitMemberExpr(const MemberExpr *E) {
   }
 
   // Base above gives us a pointer on the stack.
-  if (const auto *FD = dyn_cast<FieldDecl>(Member)) {
-    const RecordDecl *RD = FD->getParent();
-    const Record *R = getRecord(RD);
-    if (!R)
-      return false;
-    const Record::Field *F = R->getField(FD);
-    // Leave a pointer to the field on the stack.
-    if (F->Decl->getType()->isReferenceType())
-      return this->emitGetFieldPop(PT_Ptr, F->Offset, E) && maybeLoadValue();
-    return this->emitGetPtrFieldPop(F->Offset, E) && maybeLoadValue();
-  }
-
-  return false;
+  const auto *FD = cast<FieldDecl>(Member);
+  const RecordDecl *RD = FD->getParent();
+  const Record *R = getRecord(RD);
+  if (!R)
+    return false;
+  const Record::Field *F = R->getField(FD);
+  // Leave a pointer to the field on the stack.
+  if (F->Decl->getType()->isReferenceType())
+    return this->emitGetFieldPop(PT_Ptr, F->Offset, E) && maybeLoadValue();
+  return this->emitGetPtrFieldPop(F->Offset, E) && maybeLoadValue();
 }
 
 template <class Emitter>
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 11e2d549d8a450..99e12da0081afc 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -5746,78 +5746,6 @@ void CGDebugInfo::EmitExternalVariable(llvm::GlobalVariable *Var,
   Var->addDebugInfo(GVE);
 }
 
-void CGDebugInfo::EmitPseudoVariable(CGBuilderTy &Builder,
-                                     llvm::Instruction *Value, QualType Ty) {
-  // Only when -g2 or above is specified, debug info for variables will be
-  // generated.
-  if (CGM.getCodeGenOpts().getDebugInfo() <=
-      llvm::codegenoptions::DebugLineTablesOnly)
-    return;
-
-  llvm::DebugLoc SaveDebugLoc = Builder.getCurrentDebugLocation();
-  if (!SaveDebugLoc.get())
-    return;
-
-  llvm::DIFile *Unit = SaveDebugLoc->getFile();
-  llvm::DIType *Type = getOrCreateType(Ty, Unit);
-
-  // Check if Value is already a declared variable and has debug info, in this
-  // case we have nothing to do. Clang emits declared variable as alloca, and
-  // it is loaded upon use, so we identify such pattern here.
-  if (llvm::LoadInst *Load = dyn_cast<llvm::LoadInst>(Value)) {
-    llvm::Value *Var = Load->getPointerOperand();
-    // There can be implicit type cast applied on a variable if it is an opaque
-    // ptr, in this case its debug info may not match the actual type of object
-    // being used as in the next instruction, so we will need to emit a pseudo
-    // variable for type-casted value.
-    auto DeclareTypeMatches = [&](auto *DbgDeclare) {
-      return DbgDeclare->getVariable()->getType() == Type;
-    };
-    if (any_of(llvm::findDbgDeclares(Var), DeclareTypeMatches) ||
-        any_of(llvm::findDVRDeclares(Var), DeclareTypeMatches))
-      return;
-  }
-
-  // Find the correct location to insert a sequence of instructions to
-  // materialize Value on the stack.
-  auto SaveInsertionPoint = Builder.saveIP();
-  if (llvm::InvokeInst *Invoke = dyn_cast<llvm::InvokeInst>(Value))
-    Builder.SetInsertPoint(Invoke->getNormalDest()->begin());
-  else if (llvm::Instruction *Next = Value->getIterator()->getNextNode())
-    Builder.SetInsertPoint(Next);
-  else
-    Builder.SetInsertPoint(Value->getParent());
-  llvm::DebugLoc DL = Value->getDebugLoc();
-  if (DL.get())
-    Builder.SetCurrentDebugLocation(DL);
-  else if (!Builder.getCurrentDebugLocation().get())
-    Builder.SetCurrentDebugLocation(SaveDebugLoc);
-
-  llvm::AllocaInst *PseudoVar = Builder.CreateAlloca(Value->getType());
-  Address PseudoVarAddr(PseudoVar, Value->getType(),
-                        CharUnits::fromQuantity(PseudoVar->getAlign()));
-  llvm::LoadInst *Load = Builder.CreateLoad(PseudoVarAddr);
-  Value->replaceAllUsesWith(Load);
-  Builder.SetInsertPoint(Load);
-  Builder.CreateStore(Value, PseudoVarAddr);
-
-  // Emit debug info for materialized Value.
-  unsigned Line = Builder.getCurrentDebugLocation().getLine();
-  unsigned Column = Builder.getCurrentDebugLocation().getCol();
-  llvm::DILocalVariable *D = DBuilder.createAutoVariable(
-      LexicalBlockStack.back(), "", nullptr, 0, Type, false,
-      llvm::DINode::FlagArtificial);
-  llvm::DILocation *DIL =
-      llvm::DILocation::get(CGM.getLLVMContext(), Line, Column,
-                            LexicalBlockStack.back(), CurInlinedAt);
-  SmallVector<uint64_t, 1> Expr;
-  DBuilder.insertDeclare(PseudoVar, D, DBuilder.createExpression(Expr), DIL,
-                         Load);
-
-  Builder.restoreIP(SaveInsertionPoint);
-  Builder.SetCurrentDebugLocation(SaveDebugLoc);
-}
-
 void CGDebugInfo::EmitGlobalAlias(const llvm::GlobalValue *GV,
                                   const GlobalDecl GD) {
diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h
index da466837aa3c34..8fe738be215687 100644
--- a/clang/lib/CodeGen/CGDebugInfo.h
+++ b/clang/lib/CodeGen/CGDebugInfo.h
@@ -530,12 +530,6 @@ class CGDebugInfo {
   /// Emit information about an external variable.
   void EmitExternalVariable(llvm::GlobalVariable *GV, const VarDecl *Decl);
 
-  /// Emit a pseudo variable and debug info for an intermediate value if it does
-  /// not correspond to a variable in the source code, so that a profiler can
-  /// track more accurate usage of certain instructions of interest.
-  void EmitPseudoVariable(CGBuilderTy &Builder, llvm::Instruction *Value,
-                          QualType Ty);
-
   /// Emit information about global variable alias.
   void EmitGlobalAlias(const llvm::GlobalValue *GV, const GlobalDecl Decl);
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 58f0a3113b4f81..1b144c178ce960 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -1937,26 +1937,7 @@ Value *ScalarExprEmitter::VisitMemberExpr(MemberExpr *E) {
     }
   }
 
-  llvm::Value *Result = EmitLoadOfLValue(E);
-
-  // If -fdebug-info-for-profiling is specified, emit a pseudo variable and its
-  // debug info for the pointer, even if there is no variable associated with
-  // the pointer's expression.
-  if (CGF.CGM.getCodeGenOpts().DebugInfoForProfiling && CGF.getDebugInfo()) {
-    if (llvm::LoadInst *Load = dyn_cast<llvm::LoadInst>(Result)) {
-      if (llvm::GetElementPtrInst *GEP =
-              dyn_cast<llvm::GetElementPtrInst>(Load->getPointerOperand())) {
-        if (llvm::Instruction *Pointer =
-                dyn_cast<llvm::Instruction>(GEP->getPointerOperand())) {
-          QualType Ty = E->getBase()->getType();
-          if (!E->isArrow())
-            Ty = CGF.getContext().getPointerType(Ty);
-          CGF.getDebugInfo()->EmitPseudoVariable(Builder, Pointer, Ty);
-        }
-      }
-    }
-  }
-  return Result;
+  return EmitLoadOfLValue(E);
 }
 
 Value *ScalarExprEmitter::VisitArraySubscriptExpr(ArraySubscriptExpr *E) {
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 4b9b735f1cfb43..95a6fe66babae9 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -1493,7 +1493,7 @@ void Sema::ActOnExitFunctionContext() {
 ///
 /// This routine determines whether overloading is possible, not
 /// whether a new declaration actually overloads a previous one.
-/// It will return true in C++ (where overloads are alway permitted)
+/// It will return true in C++ (where overloads are always permitted)
 /// or, as a C extension, when either the new declaration or a
 /// previous one is declared with the 'overloadable' attribute.
 static bool AllowOverloadingOfFunction(const LookupResult &Previous,
@@ -4147,7 +4147,7 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, Scope *S,
 
   // If we are merging two functions where only one of them has a prototype,
   // we may have enough information to decide to issue a diagnostic that the
-  // function without a protoype will change behavior in C23. This handles
+  // function without a prototype will change behavior in C23. This handles
   // cases like:
   //   void i(); void i(int j);
   //   void i(int j); void i();
@@ -10553,7 +10553,7 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
   if (getLangOpts().CUDA && !isFunctionTemplateSpecialization)
     CUDA().maybeAddHostDeviceAttrs(NewFD, Previous);
 
-  // Handle explict specializations of function templates
+  // Handle explicit specializations of function templates
   // and friend function declarations with an explicit
   // template argument list.
   if (isFunctionTemplateSpecialization) {
@@ -12601,7 +12601,7 @@ void Sema::CheckMSVCRTEntryPoint(FunctionDecl *FD) {
     if (FD->getName() != "DllMain")
       FD->setHasImplicitReturnZero(true);
 
-  // Explicity specified calling conventions are applied to MSVC entry points
+  // Explicitly specified calling conventions are applied to MSVC entry points
   if (!hasExplicitCallingConv(T)) {
     if (isDefaultStdCall(FD, *this)) {
       if (FT->getCallConv() != CC_X86StdCall) {
@@ -13674,12 +13674,12 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
           CreateRecoveryExpr(Init->getBeginLoc(), Init->getEndLoc(), Args);
       if (RecoveryExpr.get())
         VDecl->setInit(RecoveryExpr.get());
-      // In general, for error recovery purposes, the initalizer doesn't play
+      // In general, for error recovery purposes, the initializer doesn't play
       // part in the valid bit of the declaration. There are a few exceptions:
       //  1) if the var decl has a deduced auto type, and the type cannot be
       //     deduced by an invalid initializer;
-      //  2) if the var decl is decompsition decl with a non-deduced type, and
-      //     the initialization fails (e.g. `int [a] = {1, 2};`);
+      //  2) if the var decl is a decomposition decl with a non-deduced type,
+      //     and the initialization fails (e.g. `int [a] = {1, 2};`);
       // Case 1) was already handled elsewhere.
       if (isa<DecompositionDecl>(VDecl)) // Case 2)
         VDecl->setInvalidDecl();
@@ -13897,9 +13897,9 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
     }
   } else if (VDecl->isFileVarDecl()) {
     // In C, extern is typically used to avoid tentative definitions when
-    // declaring variables in headers, but adding an intializer makes it a
+    // declaring variables in headers, but adding an initializer makes it a
     // definition. This is somewhat confusing, so GCC and Clang both warn on it.
-    // In C++, extern is often used to give implictly static const variables
+    // In C++, extern is often used to give implicitly static const variables
     // external linkage, so don't warn in that case. If selectany is present,
     // this might be header code intended for C and C++ inclusion, so apply the
    // C++ rules.
@@ -14093,7 +14093,7 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) {
         return;
       }
     }
-    // The declaration is unitialized, no need for further checks.
+    // The declaration is uninitialized, no need for further checks.
     return;
  }
 
@@ -16324,7 +16324,7 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body,
       FSI->ObjCWarnForNoDesignatedInitChain = false;
     }
     if (FSI->ObjCWarnForNoInitDelegation) {
-      // Don't issue this warning for unavaialable inits.
+      // Don't issue this warning for unavailable inits.
       if (!MD->isUnavailable())
         Diag(MD->getLocation(),
              diag::warn_objc_secondary_init_missing_init_call);
@@ -17876,7 +17876,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
             SkipBody->Previous = Def;
           makeMergedDefinitionVisible(Hidden);
           // Carry on and handle it like a normal definition. We'll
-          // skip starting the definitiion later.
+          // skip starting the definition later.
         }
       } else if (!IsExplicitSpecializationAfterInstantiation) {
         // A redeclaration in function prototype scope in C isn't
@@ -20475,7 +20475,7 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(const FunctionDecl *FD,
   } else if (LangOpts.OpenMP > 45) {
     // In OpenMP host compilation prior to 5.0 everything was an emitted host
     // function. In 5.0, no_host was introduced which might cause a function to
-    // be ommitted.
+    // be omitted.
     std::optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
         OMPDeclareTargetDeclAttr::getDeviceType(FD->getCanonicalDecl());
     if (DevTy)
diff --git a/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp b/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
deleted file mode 100644
index baf791487771c0..00000000000000
--- a/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-// Test debug info for intermediate value of a chained pointer deferencing
-// expression when the flag -fdebug-info-for-pointer-type is enabled.
-// RUN: %clang_cc1 -emit-llvm -triple x86_64-linux-gnu %s -fdebug-info-for-profiling -debug-info-kind=constructor -o - | FileCheck %s
-
-class A {
-public:
-  int i;
-  char c;
-  void *p;
-  int arr[3];
-};
-
-class B {
-public:
-  A* a;
-};
-
-class C {
-public:
-  B* b;
-  A* a;
-  A arr[10];
-};
-
-// CHECK-LABEL: define dso_local noundef i32 @{{.*}}func1{{.*}}(
-// CHECK: [[A_ADDR:%.*]] = getelementptr inbounds %class.B, ptr {{%.*}}, i32 0, i32 0, !dbg [[DBG1:![0-9]+]]
-// CHECK-NEXT: [[A:%.*]] = load ptr, ptr [[A_ADDR]], align {{.*}}, !dbg [[DBG1]]
-// CHECK-NEXT: [[PSEUDO1:%.*]] = alloca ptr, align {{.*}}, !dbg [[DBG1]]
-// CHECK-NEXT: store ptr [[A]], ptr [[PSEUDO1]], align {{.*}}, !dbg [[DBG1]]
-// CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META1:![0-9]+]], metadata !DIExpression()), !dbg [[DBG1]]
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]], align {{.*}}, !dbg [[DBG1]]
-// CHECK-NEXT: {{%.*}} = getelementptr inbounds %class.A, ptr [[TMP1]], i32 0, i32 0,
-int func1(B *b) {
-  return b->a->i;
-}
-
-// Should generate a pseudo variable when pointer is type-casted.
-// CHECK-LABEL: define dso_local noundef ptr @{{.*}}func2{{.*}}(
-// CHECK: call void @llvm.dbg.declare(metadata ptr [[B_ADDR:%.*]], metadata [[META2:![0-9]+]], metadata !DIExpression())
-// CHECK-NEXT: [[B:%.*]] = load ptr, ptr [[B_ADDR]],
-// CHECK-NEXT: [[PSEUDO1:%.*]] = alloca ptr,
-// CHECK-NEXT: store ptr [[B]], ptr [[PSEUDO1]],
-// CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META3:![0-9]+]], metadata !DIExpression())
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]],
-// CHECK-NEXT: {{%.*}} = getelementptr inbounds %class.B, ptr [[TMP1]], i32 0,
-A* func2(void *b) {
-  return ((B*)b)->a;
-}
-
-// Should not generate pseudo variable in this case.
-// CHECK-LABEL: define dso_local noundef i32 @{{.*}}func3{{.*}}(
-// CHECK: call void @llvm.dbg.declare(metadata ptr [[B_ADDR:%.*]], metadata [[META4:![0-9]+]], metadata !DIExpression())
-// CHECK: call void @llvm.dbg.declare(metadata ptr [[LOCAL1:%.*]], metadata [[META5:![0-9]+]], metadata !DIExpression())
-// CHECK-NOT: call void @llvm.dbg.declare(metadata ptr
-int func3(B *b) {
-  A *local1 = b->a;
-  return local1->i;
-}
-
-// CHECK-LABEL: define dso_local noundef signext i8 @{{.*}}func4{{.*}}(
-// CHECK: [[A_ADDR:%.*]] = getelementptr inbounds %class.C, ptr {{%.*}}, i32 0, i32 1
-// CHECK-NEXT: [[A:%.*]] = load ptr, ptr [[A_ADDR]],
-// CHECK-NEXT: [[PSEUDO1:%.*]] = alloca ptr,
-// CHECK-NEXT: store ptr [[A]], ptr [[PSEUDO1]],
-// CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META6:![0-9]+]], metadata !DIExpression())
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]],
-// CHECK-NEXT: {{%.*}} = getelementptr inbounds %class.A, ptr [[TMP1]], i32 0, i32 0,
-// CHECK: [[CALL:%.*]] = call noundef ptr @{{.*}}foo{{.*}}(
-// CHECK-NEXT: [[PSEUDO2:%.*]] = alloca ptr,
-// CHECK-NEXT: store ptr [[CALL]], ptr [[PSEUDO2]]
-// CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr [[PSEUDO2]], metadata [[META6]], metadata !DIExpression())
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PSEUDO2]]
-// CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds %class.A, ptr [[TMP2]], i32 0, i32 1
-char func4(C *c) {
-  extern A* foo(int x);
-  return foo(c->a->i)->c;
-}
-
-// CHECK-LABEL: define dso_local noundef signext i8 @{{.*}}func5{{.*}}(
-// CHECK: call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META7:![0-9]+]], metadata !DIExpression())
-// CHECK: call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META8:![0-9]+]], metadata !DIExpression())
-// CHECK: [[A_ADDR:%.*]] = getelementptr inbounds %class.A, ptr {{%.*}}, i64 {{%.*}},
-// CHECK-NEXT: [[PSEUDO1:%.*]] = alloca ptr,
-// CHECK-NEXT: store ptr [[A_ADDR]], ptr [[PSEUDO1]],
-// CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META9:![0-9]+]], metadata !DIExpression())
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]],
-// CHECK-NEXT: {{%.*}} = getelementptr inbounds %class.A, ptr [[TMP1]], i32 0, i32 1,
-char func5(void *arr, int n) {
-  return ((A*)arr)[n].c;
-}
-
-// CHECK-LABEL: define dso_local noundef i32 @{{.*}}func6{{.*}}(
-// CHECK: call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META10:![0-9]+]], metadata !DIExpression())
-// CHECK: call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META11:![0-9]+]], metadata !DIExpression())
-int func6(B &b) {
-  return reinterpret_cast<A&>(b).i;
-}
-
-// CHECK-DAG: [[META_A:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A",
-// CHECK-DAG: [[META_AP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META_A]],
-// CHECK-DAG: [[META_B:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "B",
-// CHECK-DAG: [[META_BP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META_B]],
-// CHECK-DAG: [[META_C:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "C",
-// CHECK-DAG: [[META_CP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META_C]],
-// CHECK-DAG: [[META_VP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null,
-// CHECK-DAG: [[META_I32:![0-9]+]] = !DIBasicType(name: "int", size: 32,
-// CHECK-DAG: [[META_BR:![0-9]+]] = !DIDerivedType(tag: DW_TAG_reference_type, baseType: [[META_B]],
-
-// CHECK-DAG: [[DBG1]] = !DILocation(line: 34, column: 13,
-// CHECK-DAG: [[META1]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)
-// CHECK-DAG: [[META2]] = !DILocalVariable(name: "b", arg: 1, scope: {{.*}}, file: {{.*}}, line: 46, type: [[META_VP]])
-// CHECK-DAG: [[META3]] = !DILocalVariable(scope: {{.*}}, type: [[META_BP]], flags: DIFlagArtificial)
-// CHECK-DAG: [[META4]] = !DILocalVariable(name: "b", arg: 1, scope: {{.*}}, file: {{.*}}, line: 55, type: [[META_BP]])
-// CHECK-DAG: [[META5]] = !DILocalVariable(name: "local1", scope: {{.*}}, file: {{.*}}, line: 56, type: [[META_AP]])
-// CHECK-DAG: [[META6]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)
-// CHECK-DAG: [[META7]] = !DILocalVariable(name: "arr", arg: 1, scope: {{.*}}, file: {{.*}}, line: 88, type: [[META_VP]])
-// CHECK-DAG: [[META8]] = !DILocalVariable(name: "n", arg: 2, scope: {{.*}}, file: {{.*}}, line: 88, type: [[META_I32]])
-// CHECK-DAG: [[META9]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)
-// CHECK-DAG: [[META10]] = !DILocalVariable(name: "b", arg: 1, scope: {{.*}}, file: {{.*}}, line: 95, type: [[META_BR]])
-// CHECK-DAG: [[META11]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)
diff --git a/clang/test/CodeGenHLSL/convergence/for.hlsl b/clang/test/CodeGenHLSL/convergence/for.hlsl
index 180fae74ba7514..95f9a196bdb676 100644
--- a/clang/test/CodeGenHLSL/convergence/for.hlsl
+++ b/clang/test/CodeGenHLSL/convergence/for.hlsl
@@ -92,7 +92,7 @@ void test6() {
 // CHECK: [[C1:%[a-zA-Z0-9]+]] = call spir_func noundef i1 @_Z4condv() [[A3]] [ "convergencectrl"(token [[T1]]) ]
 // CHECK: br i1 [[C1]], label %if.then, label %if.end
 // CHECK: if.then:
-// CHECK call spir_func void @_Z3foov() [[A3:#[0-9]+]] [ "convergencectrl"(token [[T1]]) ]
+// CHECK: call spir_func void @_Z3foov() [[A3:#[0-9]+]] [ "convergencectrl"(token [[T1]]) ]
 // CHECK: br label %for.end
 // CHECK: if.end:
 // CHECK: br label %for.inc
diff --git a/clang/test/SemaCXX/ms-const-member-expr.cpp b/clang/test/SemaCXX/ms-const-member-expr.cpp
index 72cfe76fbe43a2..8312f84b550f00 100644
--- a/clang/test/SemaCXX/ms-const-member-expr.cpp
+++ b/clang/test/SemaCXX/ms-const-member-expr.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -std=c++11 -fms-compatibility -fsyntax-only -verify
+// RUN: %clang_cc1 %s -std=c++11 -fms-compatibility -fsyntax-only -verify -fexperimental-new-constant-interpreter
 
 struct S {
   enum { E = 1 };
diff --git a/clang/test/SemaHLSL/standard_conversion_sequences.hlsl b/clang/test/SemaHLSL/standard_conversion_sequences.hlsl
index a0d398105f15d6..256981d2c1e2e0 100644
--- a/clang/test/SemaHLSL/standard_conversion_sequences.hlsl
+++ b/clang/test/SemaHLSL/standard_conversion_sequences.hlsl
@@ -4,9 +4,8 @@
 void test() {
 
   // CHECK: VarDecl {{.*}} used f3 'vector<float, 3>':'float __attribute__((ext_vector_type(3)))' cinit
-  // CHECK-NEXt: ImplicitCastExpr {{.*}} 'vector<float, 3>':'float __attribute__((ext_vector_type(3)))'
-  // CHECK-NEXt: ImplicitCastExpr {{.*}} 'float'
-  // CHECK-NEXt: FloatingLiteral {{.*}} 'double' 1.000000e+00
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<float, 3>':'float __attribute__((ext_vector_type(3)))'
+  // CHECK-NEXT: FloatingLiteral {{.*}} 'float' 1.000000e+00
   vector<float,3> f3 = 1.0; // No warning for splatting to a vector from a literal.
diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index bed2ccb8b992a2..8a583bacb4a934 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -1392,7 +1392,7 @@ template <typename Config> class SizeClassAllocator64 {
         continue;
       }
 
-      const uptr PushedBytesDelta = BG->BytesInBGAtLastCheckpoint - BytesInBG;
+      const uptr PushedBytesDelta = BytesInBG - BG->BytesInBGAtLastCheckpoint;
 
       // Given the randomness property, we try to release the pages only if the
      // bytes used by free blocks exceed certain proportion of group size. Note
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 52f2034b8707a3..ec1fb411ff0e25 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -204,6 +204,8 @@ struct IntrinsicLibrary {
                                           llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCAssociatedCPtr(mlir::Type,
                                         llvm::ArrayRef<fir::ExtendedValue>);
+  mlir::Value genErfcScaled(mlir::Type resultType,
+                            llvm::ArrayRef<mlir::Value> args);
   void genCFPointer(llvm::ArrayRef<fir::ExtendedValue>);
   void genCFProcPointer(llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCFunLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h b/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
index 558358257b5134..6857650ce52b74 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
@@ -18,6 +18,10 @@ class FirOpBuilder;
 
 namespace fir::runtime {
 
+/// Generate call to ErfcScaled intrinsic runtime routine.
+mlir::Value genErfcScaled(fir::FirOpBuilder &builder, mlir::Location loc,
+                          mlir::Value x);
+
 /// Generate call to Exponent intrinsic runtime routine.
 mlir::Value genExponent(fir::FirOpBuilder &builder, mlir::Location loc,
                         mlir::Type resultType, mlir::Value x);
diff --git a/flang/include/flang/Runtime/numeric.h b/flang/include/flang/Runtime/numeric.h
index 7d3f91360c8cfb..e051e864316630 100644
--- a/flang/include/flang/Runtime/numeric.h
+++ b/flang/include/flang/Runtime/numeric.h
@@ -73,6 +73,20 @@ CppTypeFor<TypeCategory::Integer, 16> RTDECL(Ceiling16_16)(
 #endif
 #endif
 
+// ERFC_SCALED
+CppTypeFor<TypeCategory::Real, 4> RTDECL(ErfcScaled4)(
+    CppTypeFor<TypeCategory::Real, 4>);
+CppTypeFor<TypeCategory::Real, 8> RTDECL(ErfcScaled8)(
+    CppTypeFor<TypeCategory::Real, 8>);
+#if LDBL_MANT_DIG == 64
+CppTypeFor<TypeCategory::Real, 10> RTDECL(ErfcScaled10)(
+    CppTypeFor<TypeCategory::Real, 10>);
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDECL(ErfcScaled16)(
+    CppTypeFor<TypeCategory::Real, 16>);
+#endif
+
 // EXPONENT is defined to return default INTEGER; support INTEGER(4 & 8)
 CppTypeFor<TypeCategory::Integer, 4> RTDECL(Exponent4_4)(
     CppTypeFor<TypeCategory::Real, 4>);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 4cdf1f2d98caa4..c438ae1250e450 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -224,6 +224,7 @@ static constexpr IntrinsicHandler handlers[]{
      {{{"boundary", asBox, handleDynamicOptional},
        {"dim", asValue}}},
      /*isElemental=*/false},
+    {"erfc_scaled", &I::genErfcScaled},
     {"etime",
      &I::genEtime,
      {{{"values", asBox}, {"time", asBox}}},
@@ -5777,15 +5778,19 @@ IntrinsicLibrary::genReduce(mlir::Type resultType,
     return builder.create<fir::LoadOp>(loc, result);
   }
   if (fir::isa_char(eleTy)) {
-    // Create mutable fir.box to be passed to the runtime for the result.
-    fir::MutableBoxValue resultMutableBox =
-        fir::factory::createTempMutableBox(builder, loc, eleTy);
-    mlir::Value resultIrBox =
-        fir::factory::getMutableIRBox(builder, loc, resultMutableBox);
+    auto charTy = mlir::dyn_cast_or_null<fir::CharacterType>(resultType);
+    assert(charTy && "expect CharacterType");
+    fir::factory::CharacterExprHelper charHelper(builder, loc);
+    mlir::Value len;
+    if (charTy.hasDynamicLen())
+      len = charHelper.readLengthFromBox(fir::getBase(arrayTmp), charTy);
+    else
+      len = builder.createIntegerConstant(loc, builder.getI32Type(),
+                                          charTy.getLen());
+    fir::CharBoxValue temp = charHelper.createCharacterTemp(eleTy, len);
     fir::runtime::genReduce(builder, loc, array, operation, mask, identity,
-                            ordered, resultIrBox);
-    // Handle cleanup of allocatable result descriptor and return
-    return readAndAddCleanUp(resultMutableBox, resultType, "REDUCE");
+                            ordered, temp.getBuffer());
+    return temp;
   }
   return fir::runtime::genReduce(builder, loc, array, operation, mask, identity,
                                  ordered);
@@ -5878,6 +5883,16 @@ mlir::Value IntrinsicLibrary::genRRSpacing(mlir::Type resultType,
       fir::runtime::genRRSpacing(builder, loc, fir::getBase(args[0])));
 }
 
+// ERFC_SCALED
+mlir::Value IntrinsicLibrary::genErfcScaled(mlir::Type resultType,
+                                            llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 1);
+
+  return builder.createConvert(
+      loc, resultType,
+      fir::runtime::genErfcScaled(builder, loc, fir::getBase(args[0])));
+}
+
 // SAME_TYPE_AS
 fir::ExtendedValue
 IntrinsicLibrary::genSameTypeAs(mlir::Type resultType,
diff --git a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
index 8ac9d64f576b6a..1d13248db59841 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
@@ -22,6 +22,28 @@ using namespace Fortran::runtime;
 // may not have them in their runtime library. This can occur in the
 // case of cross compilation, for example.
 
+/// Placeholder for real*10 version of ErfcScaled Intrinsic
+struct ForcedErfcScaled10 {
+  static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ErfcScaled10));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto ty = mlir::FloatType::getF80(ctx);
+      return mlir::FunctionType::get(ctx, {ty}, {ty});
+    };
+  }
+};
+
+/// Placeholder for real*16 version of ErfcScaled Intrinsic
+struct ForcedErfcScaled16 {
+  static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ErfcScaled16));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto ty = mlir::FloatType::getF128(ctx);
+      return mlir::FunctionType::get(ctx, {ty}, {ty});
+    };
+  }
+};
+
 /// Placeholder for real*10 version of Exponent Intrinsic
 struct ForcedExponent10_4 {
   static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Exponent10_4));
@@ -444,6 +466,30 @@ mlir::Value fir::runtime::genRRSpacing(fir::FirOpBuilder &builder,
   return builder.create<fir::CallOp>(loc, func, args).getResult(0);
 }
 
+/// Generate call to ErfcScaled intrinsic runtime routine.
+mlir::Value fir::runtime::genErfcScaled(fir::FirOpBuilder &builder,
+                                        mlir::Location loc, mlir::Value x) {
+  mlir::func::FuncOp func;
+  mlir::Type fltTy = x.getType();
+
+  if (fltTy.isF32())
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ErfcScaled4)>(loc, builder);
+  else if (fltTy.isF64())
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ErfcScaled8)>(loc, builder);
+  else if (fltTy.isF80())
+    func = fir::runtime::getRuntimeFunc<ForcedErfcScaled10>(loc, builder);
+  else if (fltTy.isF128())
+    func = fir::runtime::getRuntimeFunc<ForcedErfcScaled16>(loc, builder);
+  else
+    fir::intrinsicTypeTODO(builder, fltTy, loc, "ERFC_SCALED");
+
+  auto funcTy = func.getFunctionType();
+  llvm::SmallVector<mlir::Value> args = {
+      builder.createConvert(loc, funcTy.getInput(0), x)};
+
+  return builder.create<fir::CallOp>(loc, func, args).getResult(0);
+}
+
 /// Generate call to Scale intrinsic runtime routine.
 mlir::Value fir::runtime::genScale(fir::FirOpBuilder &builder,
                                    mlir::Location loc, mlir::Value x,
diff --git a/flang/runtime/numeric-templates.h b/flang/runtime/numeric-templates.h
index 4936e7738a663e..1b5395df945193 100644
--- a/flang/runtime/numeric-templates.h
+++ b/flang/runtime/numeric-templates.h
@@ -354,6 +354,110 @@ template <typename T> inline RT_API_ATTRS T Spacing(T x) {
   }
 }
 
+// ERFC_SCALED (16.9.71)
+template <typename T> inline RT_API_ATTRS T ErfcScaled(T arg) {
+  // Coefficients for approximation to erfc in the first interval.
+  static const T a[5] = {3.16112374387056560e00, 1.13864154151050156e02,
+      3.77485237685302021e02, 3.20937758913846947e03, 1.85777706184603153e-1};
+  static const T b[4] = {2.36012909523441209e01, 2.44024637934444173e02,
+      1.28261652607737228e03, 2.84423683343917062e03};
+
+  // Coefficients for approximation to erfc in the second interval.
+  static const T c[9] = {5.64188496988670089e-1, 8.88314979438837594e00,
+      6.61191906371416295e01, 2.98635138197400131e02, 8.81952221241769090e02,
+      1.71204761263407058e03, 2.05107837782607147e03, 1.23033935479799725e03,
+      2.15311535474403846e-8};
+  static const T d[8] = {1.57449261107098347e01, 1.17693950891312499e02,
+      5.37181101862009858e02, 1.62138957456669019e03, 3.29079923573345963e03,
+      4.36261909014324716e03, 3.43936767414372164e03, 1.23033935480374942e03};
+
+  // Coefficients for approximation to erfc in the third interval.
+  static const T p[6] = {3.05326634961232344e-1, 3.60344899949804439e-1,
+      1.25781726111229246e-1, 1.60837851487422766e-2, 6.58749161529837803e-4,
+      1.63153871373020978e-2};
+  static const T q[5] = {2.56852019228982242e00, 1.87295284992346047e00,
+      5.27905102951428412e-1, 6.05183413124413191e-2, 2.33520497626869185e-3};
+
+  constexpr T sqrtpi{1.7724538509078120380404576221783883301349L};
+  constexpr T rsqrtpi{0.5641895835477562869480794515607725858440L};
+  constexpr T epsilonby2{std::numeric_limits<T>::epsilon() * 0.5};
+  constexpr T xneg{-26.628e0};
+  constexpr T xhuge{6.71e7};
+  constexpr T thresh{0.46875e0};
+  constexpr T zero{0.0};
+  constexpr T one{1.0};
+  constexpr T four{4.0};
+  constexpr T sixteen{16.0};
+  constexpr T xmax{1.0 / (sqrtpi * std::numeric_limits<T>::min())};
+  static_assert(xmax > xhuge, "xmax must be greater than xhuge");
+
+  T ysq;
+  T xnum;
+  T xden;
+  T del;
+  T result;
+
+  auto x{arg};
+  auto y{std::fabs(x)};
+
+  if (y <= thresh) {
+    // evaluate erf for |x| <= 0.46875
+    ysq = zero;
+    if (y > epsilonby2) {
+      ysq = y * y;
+    }
+    xnum = a[4] * ysq;
+    xden = ysq;
+    for (int i{0}; i < 3; i++) {
+      xnum = (xnum + a[i]) * ysq;
+      xden = (xden + b[i]) * ysq;
+    }
+    result = x * (xnum + a[3]) / (xden + b[3]);
+    result = one - result;
+    result = std::exp(ysq) * result;
+    return result;
+  } else if (y <= four) {
+    // evaluate erfc for 0.46875 < |x| <= 4.0
+    xnum = c[8] * y;
+    xden = y;
+    for (int i{0}; i < 7; ++i) {
+      xnum = (xnum + c[i]) * y;
+      xden = (xden + d[i]) * y;
+    }
+    result = (xnum + c[7]) / (xden + d[7]);
+  } else {
+    // evaluate erfc for |x| > 4.0
+    result = zero;
+    if (y >= xhuge) {
+      if (y < xmax) {
+        result = rsqrtpi / y;
+      }
+    } else {
+      ysq = one / (y * y);
+      xnum = p[5] * ysq;
+      xden = ysq;
+      for (int i{0}; i < 4; ++i) {
+        xnum = (xnum + p[i]) * ysq;
+        xden = (xden + q[i]) * ysq;
+      }
+      result = ysq * (xnum + p[4]) / (xden + q[4]);
+      result = (rsqrtpi - result) / y;
+    }
+  }
+  // fix up for negative argument, erf, etc.
+  if (x < zero) {
+    if (x < xneg) {
+      result = std::numeric_limits<T>::max();
+    } else {
+      ysq = trunc(x * sixteen) / sixteen;
+      del = (x - ysq) * (x + ysq);
+      y = std::exp((ysq * ysq)) * std::exp((del));
+      result = (y + y) - result;
+    }
+  }
+  return result;
+}
+
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_NUMERIC_TEMPLATES_H_
diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp
index 2225473c4690e2..7c40beb31083ff 100644
--- a/flang/runtime/numeric.cpp
+++ b/flang/runtime/numeric.cpp
@@ -316,6 +316,27 @@ CppTypeFor<TypeCategory::Integer, 16> RTDEF(Ceiling16_16)(
 #endif
 #endif
 
+CppTypeFor<TypeCategory::Real, 4> RTDEF(ErfcScaled4)(
+    CppTypeFor<TypeCategory::Real, 4> x) {
+  return ErfcScaled(x);
+}
+CppTypeFor<TypeCategory::Real, 8> RTDEF(ErfcScaled8)(
+    CppTypeFor<TypeCategory::Real, 8> x) {
+  return ErfcScaled(x);
+}
+#if LDBL_MANT_DIG == 64
+CppTypeFor<TypeCategory::Real, 10> RTDEF(ErfcScaled10)(
+    CppTypeFor<TypeCategory::Real, 10> x) {
+  return ErfcScaled(x);
+}
+#endif
+#if LDBL_MANT_DIG == 113
+CppTypeFor<TypeCategory::Real, 16> RTDEF(ErfcScaled16)(
+    CppTypeFor<TypeCategory::Real, 16> x) {
+  return ErfcScaled(x);
+}
+#endif
+
 CppTypeFor<TypeCategory::Integer, 4> RTDEF(Exponent4_4)(
     CppTypeFor<TypeCategory::Real, 4> x) {
   return Exponent<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
diff --git a/flang/test/Lower/Intrinsics/erfc_scaled.f90 b/flang/test/Lower/Intrinsics/erfc_scaled.f90
new file mode 100644
index 00000000000000..ab5e90cb2409ea
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/erfc_scaled.f90
@@ -0,0 +1,23 @@
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s
+
+! CHECK-LABEL: func @_QPerfc_scaled4(
+! CHECK-SAME: %[[x:[^:]+]]: !fir.ref<f32>{{.*}}) -> f32
+function erfc_scaled4(x)
+  real(kind=4) :: erfc_scaled4
+  real(kind=4) :: x
+  erfc_scaled4 = erfc_scaled(x);
+! CHECK: %[[a1:.*]] = fir.load %[[x]] : !fir.ref<f32>
+! CHECK: %{{.*}} = fir.call @_FortranAErfcScaled4(%[[a1]]) {{.*}}: (f32) -> f32
+end function erfc_scaled4
+
+
+! CHECK-LABEL: func @_QPerfc_scaled8(
+! CHECK-SAME: %[[x:[^:]+]]: !fir.ref<f64>{{.*}}) -> f64
+function erfc_scaled8(x)
+  real(kind=8) :: erfc_scaled8
+  real(kind=8) :: x
+  erfc_scaled8 = erfc_scaled(x);
+! CHECK: %[[a1:.*]] = fir.load %[[x]] : !fir.ref<f64>
+! CHECK: %{{.*}} = fir.call @_FortranAErfcScaled8(%[[a1]]) {{.*}}: (f64) -> f64
+end function erfc_scaled8
diff --git a/flang/test/Lower/Intrinsics/reduce.f90 b/flang/test/Lower/Intrinsics/reduce.f90
index 842e626d7cc397..8d7b7798a94c56 100644
--- a/flang/test/Lower/Intrinsics/reduce.f90
+++ b/flang/test/Lower/Intrinsics/reduce.f90
@@ -348,21 +348,25 @@ subroutine char1(a)
   res = reduce(a, red_char1)
 end subroutine
 
-! CHECK: fir.call @_FortranAReduceChar1
+! CHECK: %[[CHRTMP:.*]] = fir.alloca !fir.char<1> {bindc_name = ".chrtmp"}
+! CHECK: %[[RESULT:.*]] = fir.convert %[[CHRTMP]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
+! CHECK: fir.call @_FortranAReduceChar1(%[[RESULT]], {{.*}})
 
 pure function red_char2(a,b)
-  character(kind=2), intent(in) :: a, b
-  character(kind=2) :: red_char2
+  character(kind=2, len=10), intent(in) :: a, b
+  character(kind=2, len=10) :: red_char2
   red_char2 = a // b
 end function
 
 subroutine char2(a)
-  character(kind=2), intent(in) :: a(:)
-  character(kind=2) :: res
+  character(kind=2, len=10), intent(in) :: a(:)
+  character(kind=2, len=10) :: res
   res = reduce(a, red_char2)
 end subroutine
 
-! CHECK: fir.call @_FortranAReduceChar2
+! CHECK: %[[CHRTMP:.*]] = fir.alloca !fir.char<2,10> {bindc_name = ".chrtmp"}
+! CHECK: %[[RESULT:.*]] = fir.convert %[[CHRTMP]] : (!fir.ref<!fir.char<2,10>>) -> !fir.ref<i16>
+! CHECK: fir.call @_FortranAReduceChar2(%[[RESULT]], {{.*}})
 
 pure function red_char4(a,b)
   character(kind=4), intent(in) :: a, b
@@ -598,8 +602,8 @@ subroutine char1dim(a)
 ! CHECK: fir.call @_FortranAReduceCharacter1Dim
 
 subroutine char2dim(a)
-  character(kind=2), intent(in) :: a(:, :)
-  character(kind=2), allocatable :: res(:)
+  character(kind=2, len=10), intent(in) :: a(:, :)
+  character(kind=2, len=10), allocatable :: res(:)
   res = reduce(a, red_char2, 2)
 end subroutine
 
@@ -613,4 +617,22 @@ subroutine char4dim(a)
 
 ! CHECK: fir.call @_FortranAReduceCharacter4Dim
 
+pure function red_char_dyn(a, b)
+  character(*), intent(In) :: a, b
+  character(max(len(a),len(b))) :: red_char_dyn
+  red_char_dyn = max(a, b)
+end function
+
+subroutine charDyn()
+  character(5) :: res
+  character(:), allocatable :: a(:)
+  allocate(character(10)::a(10))
+  res = reduce(a, red_char_dyn)
+end subroutine
+
+! CHECK: %[[BOX_ELESIZE:.*]] = fir.box_elesize %{{.*}} : (!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>) -> index
+! CHECK: %[[CHRTMP:.*]] = fir.alloca !fir.char<1,?>(%[[BOX_ELESIZE]] : index) {bindc_name = ".chrtmp"}
+! CHECK: %[[RESULT:.*]] = fir.convert %[[CHRTMP]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
CHECK: fir.call @_FortranAReduceChar1(%[[RESULT]], {{.*}}) + end module diff --git a/flang/unittests/Runtime/Numeric.cpp b/flang/unittests/Runtime/Numeric.cpp index b69ff21ea79fb0..9f77e165707834 100644 --- a/flang/unittests/Runtime/Numeric.cpp +++ b/flang/unittests/Runtime/Numeric.cpp @@ -31,6 +31,14 @@ TEST(Numeric, Floor) { EXPECT_EQ(RTNAME(Floor4_1)(Real<4>{0}), 0); } +TEST(Numeric, Erfc_scaled) { + EXPECT_NEAR(RTNAME(ErfcScaled4)(Real<4>{20.0}), 0.02817434874, 1.0e-8); + EXPECT_NEAR(RTNAME(ErfcScaled8)(Real<8>{20.0}), 0.02817434874, 1.0e-11); +#if LDBL_MANT_DIG == 64 + EXPECT_NEAR(RTNAME(ErfcScaled10)(Real<10>{20.0}), 0.02817434874, 1.0e-8); +#endif +} + TEST(Numeric, Exponent) { EXPECT_EQ(RTNAME(Exponent4_4)(Real<4>{0}), 0); EXPECT_EQ(RTNAME(Exponent4_8)(Real<4>{1.0}), 1); diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 381061ce3fcbf0..db96a80051a8dd 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -541,6 +541,8 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.rintf16 libc.src.math.roundf16 libc.src.math.roundevenf16 + libc.src.math.totalorderf16 + libc.src.math.totalordermagf16 libc.src.math.truncf16 libc.src.math.ufromfpf16 libc.src.math.ufromfpxf16 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index e99960b12441da..355eaf33ace6d1 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -571,6 +571,8 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.rintf16 libc.src.math.roundf16 libc.src.math.roundevenf16 + libc.src.math.totalorderf16 + libc.src.math.totalordermagf16 libc.src.math.truncf16 libc.src.math.ufromfpf16 libc.src.math.ufromfpxf16 diff --git a/libc/docs/c23.rst b/libc/docs/c23.rst index fec9b24bbd5815..4134befd1ed358 100644 --- a/libc/docs/c23.rst +++ b/libc/docs/c23.rst @@ -42,8 +42,8 @@ Additions: * rsqrt* * __STDC_IEC_60559_DFP__ functions (_Decimal32, _Decimal64, _Decimal128) * compoundn* - * totalorder* - * totalordermag* + * totalorder* |check| + * totalordermag* |check| * getpayload* * setpayload* * iscannonical diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index f83a646c34b57c..d556885eda6223 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -210,6 +210,10 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | scalbn | |check| | |check| | |check| | | |check| | 7.12.6.19 | F.10.3.19 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ +| totalorder | | | | |check| | | F.10.12.1 | N/A | ++------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ +| totalordermag | | | | |check| | | F.10.12.2 | N/A | ++------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | trunc | |check| | |check| | |check| | |check| | |check| | 7.12.9.9 | F.10.6.9 | 
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | ufromfp          | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.9.10              | F.10.6.10                  |
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 34169948fc6d27..b134ec00a7d7a2 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -710,6 +710,10 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"canonicalizel", RetValSpec<IntType>, [ArgSpec<LongDoublePtr>, ArgSpec<LongDoublePtr>]>,
           GuardedFunctionSpec<"canonicalizef16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"canonicalizef128", RetValSpec<IntType>, [ArgSpec<Float128Ptr>, ArgSpec<Float128Ptr>], "LIBC_TYPES_HAS_FLOAT128">,
+
+          GuardedFunctionSpec<"totalorderf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
+
+          GuardedFunctionSpec<"totalordermagf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
       ]
   >;
diff --git a/libc/src/__support/FPUtil/BasicOperations.h b/libc/src/__support/FPUtil/BasicOperations.h
index e5ac101fedc0e3..beb8e48db8f51b 100644
--- a/libc/src/__support/FPUtil/BasicOperations.h
+++ b/libc/src/__support/FPUtil/BasicOperations.h
@@ -240,6 +240,31 @@ LIBC_INLINE int canonicalize(T &cx, const T &x) {
   return 0;
 }

+template <typename T>
+LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, bool>
+totalorder(T x, T y) {
+  using FPBits = FPBits<T>;
+  FPBits x_bits(x);
+  FPBits y_bits(y);
+
+  using StorageType = typename FPBits::StorageType;
+  StorageType x_u = x_bits.uintval();
+  StorageType y_u = y_bits.uintval();
+
+  using signed_t = cpp::make_signed_t<StorageType>;
+  signed_t x_signed = static_cast<signed_t>(x_u);
+  signed_t y_signed = static_cast<signed_t>(y_u);
+
+  bool both_neg = (x_u & y_u & FPBits::SIGN_MASK) != 0;
+  return x_signed == y_signed || ((x_signed <= y_signed) != both_neg);
+}
+
+template <typename T>
+LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, bool>
+totalordermag(T x, T y) {
+  return FPBits<T>(x).abs().uintval() <= FPBits<T>(y).abs().uintval();
+}
+
 } // namespace fputil
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 82dfdaf479ff00..2446c293b8ef5a 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -369,6 +369,10 @@ add_math_entrypoint_object(tanhf)
 add_math_entrypoint_object(tgamma)
 add_math_entrypoint_object(tgammaf)

+add_math_entrypoint_object(totalorderf16)
+
+add_math_entrypoint_object(totalordermagf16)
+
 add_math_entrypoint_object(trunc)
 add_math_entrypoint_object(truncf)
 add_math_entrypoint_object(truncl)
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index f4f683e61bd658..673bef516b13d1 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -3577,3 +3577,27 @@ add_entrypoint_object(
   COMPILE_OPTIONS
     -O3
 )
+
+add_entrypoint_object(
+  totalorderf16
+  SRCS
+    totalorderf16.cpp
+  HDRS
+    ../totalorderf16.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  totalordermagf16
+  SRCS
+    totalordermagf16.cpp
+  HDRS
+    ../totalordermagf16.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
diff --git a/libc/src/math/generic/totalorderf16.cpp b/libc/src/math/generic/totalorderf16.cpp
new file mode 100644
index 00000000000000..e43beb33d2fd3d
--- /dev/null
+++ b/libc/src/math/generic/totalorderf16.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of totalorderf16 function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/totalorderf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, totalorderf16, (const float16 *x, const float16 *y)) { + return static_cast(fputil::totalorder(*x, *y)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/totalordermagf16.cpp b/libc/src/math/generic/totalordermagf16.cpp new file mode 100644 index 00000000000000..09d04fbeb2d2c6 --- /dev/null +++ b/libc/src/math/generic/totalordermagf16.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of totalordermagf16 function -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/totalordermagf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, totalordermagf16, + (const float16 *x, const float16 *y)) { + return static_cast(fputil::totalordermag(*x, *y)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/totalorderf16.h b/libc/src/math/totalorderf16.h new file mode 100644 index 00000000000000..f5390140c4dc2e --- /dev/null +++ b/libc/src/math/totalorderf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for totalorderf16 -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERF16_H +#define LLVM_LIBC_SRC_MATH_TOTALORDERF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +int totalorderf16(const float16 *x, const float16 *y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_TOTALORDERF16_H diff --git a/libc/src/math/totalordermagf16.h b/libc/src/math/totalordermagf16.h new file mode 100644 index 00000000000000..8c6621b9783dfb --- /dev/null +++ b/libc/src/math/totalordermagf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for totalordermagf16 --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERMAGF16_H +#define LLVM_LIBC_SRC_MATH_TOTALORDERMAGF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +int totalordermagf16(const float16 *x, const float16 *y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_TOTALORDERMAGF16_H diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h index 26af5cec02b587..86b823260e1979 100644 --- a/libc/test/UnitTest/FPMatcher.h +++ b/libc/test/UnitTest/FPMatcher.h @@ -97,8 +97,10 @@ template struct FPTest : public Test { LIBC_NAMESPACE::cpp::numeric_limits::max(); \ const T zero = FPBits::zero(Sign::POS).get_val(); \ const T neg_zero = FPBits::zero(Sign::NEG).get_val(); \ - const T aNaN = FPBits::quiet_nan().get_val(); \ - const T sNaN = FPBits::signaling_nan().get_val(); \ + const T aNaN = FPBits::quiet_nan(Sign::POS).get_val(); \ + const T neg_aNaN = FPBits::quiet_nan(Sign::NEG).get_val(); \ + const T sNaN = FPBits::signaling_nan(Sign::POS).get_val(); \ + const T neg_sNaN = FPBits::signaling_nan(Sign::NEG).get_val(); \ const T inf = FPBits::inf(Sign::POS).get_val(); \ const T neg_inf = FPBits::inf(Sign::NEG).get_val(); \ const T min_normal = FPBits::min_normal().get_val(); \ diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 75e2bdd7be100a..68cd412b14e9d3 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3519,3 +3519,27 @@ add_fp_unittest( libc.src.math.powf libc.src.__support.FPUtil.fp_bits ) + +add_fp_unittest( + totalorderf16_test + SUITE + libc-math-smoke-tests + SRCS + totalorderf16_test.cpp + HDRS + TotalOrderTest.h + DEPENDS + libc.src.math.totalorderf16 +) + +add_fp_unittest( + totalordermagf16_test + SUITE + libc-math-smoke-tests + SRCS + totalordermagf16_test.cpp + HDRS + TotalOrderMagTest.h + DEPENDS + libc.src.math.totalordermagf16 +) diff --git a/libc/test/src/math/smoke/TotalOrderMagTest.h b/libc/test/src/math/smoke/TotalOrderMagTest.h new file mode 100644 index 00000000000000..5fe2983a0e678b --- /dev/null +++ b/libc/test/src/math/smoke/TotalOrderMagTest.h @@ -0,0 +1,142 @@ +//===-- Utility class to test flavors of totalordermag ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LIBC_TEST_SRC_MATH_SMOKE_TOTALORDERMAGTEST_H +#define LIBC_TEST_SRC_MATH_SMOKE_TOTALORDERMAGTEST_H + +#include "test/UnitTest/FEnvSafeTest.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +template +class TotalOrderMagTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { + + DECLARE_SPECIAL_CONSTANTS(T) + +public: + typedef int (*TotalOrderMagFunc)(const T *, const T *); + + bool funcWrapper(TotalOrderMagFunc func, T x, T y) { + return func(&x, &y) != 0; + } + + void testXLesserThanY(TotalOrderMagFunc func) { + EXPECT_TRUE(funcWrapper(func, neg_inf, inf)); + + EXPECT_TRUE(funcWrapper(func, T(0.0), T(0.1))); + EXPECT_TRUE(funcWrapper(func, T(0.0), T(123.38))); + + EXPECT_FALSE(funcWrapper(func, T(-0.1), T(0.0))); + EXPECT_FALSE(funcWrapper(func, T(-123.38), T(0.0))); + + EXPECT_TRUE(funcWrapper(func, T(-0.1), T(0.1))); + EXPECT_TRUE(funcWrapper(func, T(-123.38), T(123.38))); + } + + void testXGreaterThanY(TotalOrderMagFunc func) { + EXPECT_TRUE(funcWrapper(func, inf, neg_inf)); + + EXPECT_TRUE(funcWrapper(func, T(0.0), T(-0.1))); + EXPECT_TRUE(funcWrapper(func, T(0.0), T(-123.38))); + + EXPECT_FALSE(funcWrapper(func, T(0.1), T(0.0))); + EXPECT_FALSE(funcWrapper(func, T(123.38), T(0.0))); + + EXPECT_TRUE(funcWrapper(func, T(0.1), T(-0.1))); + EXPECT_TRUE(funcWrapper(func, T(123.38), T(-123.38))); + } + + void testXEqualToY(TotalOrderMagFunc func) { + EXPECT_TRUE(funcWrapper(func, inf, inf)); + EXPECT_TRUE(funcWrapper(func, neg_inf, neg_inf)); + + EXPECT_TRUE(funcWrapper(func, T(-0.0), T(0.0))); + EXPECT_TRUE(funcWrapper(func, T(0.0), T(-0.0))); + + EXPECT_TRUE(funcWrapper(func, T(0.0), T(0.0))); + EXPECT_TRUE(funcWrapper(func, T(-0.0), T(-0.0))); + EXPECT_TRUE(funcWrapper(func, T(0.1), T(0.1))); + EXPECT_TRUE(funcWrapper(func, T(-0.1), T(-0.1))); + EXPECT_TRUE(funcWrapper(func, T(123.38), T(123.38))); + EXPECT_TRUE(funcWrapper(func, T(-123.38), T(-123.38))); + } + + void testSingleNaN(TotalOrderMagFunc func) { + EXPECT_FALSE(funcWrapper(func, neg_aNaN, T(0.0))); + EXPECT_FALSE(funcWrapper(func, neg_aNaN, T(0.1))); + EXPECT_FALSE(funcWrapper(func, neg_aNaN, T(123.38))); + + EXPECT_TRUE(funcWrapper(func, T(0.0), neg_aNaN)); + EXPECT_TRUE(funcWrapper(func, T(0.1), neg_aNaN)); + EXPECT_TRUE(funcWrapper(func, T(123.38), neg_aNaN)); + + EXPECT_TRUE(funcWrapper(func, T(0.0), aNaN)); + EXPECT_TRUE(funcWrapper(func, T(0.1), aNaN)); + EXPECT_TRUE(funcWrapper(func, T(123.38), aNaN)); + + EXPECT_FALSE(funcWrapper(func, aNaN, T(0.0))); + EXPECT_FALSE(funcWrapper(func, aNaN, T(0.1))); + EXPECT_FALSE(funcWrapper(func, aNaN, T(123.38))); + } + + void testNaNSigns(TotalOrderMagFunc func) { + EXPECT_TRUE(funcWrapper(func, neg_aNaN, aNaN)); + EXPECT_FALSE(funcWrapper(func, neg_aNaN, sNaN)); + EXPECT_TRUE(funcWrapper(func, neg_sNaN, aNaN)); + EXPECT_TRUE(funcWrapper(func, neg_sNaN, sNaN)); + + EXPECT_TRUE(funcWrapper(func, aNaN, neg_aNaN)); + EXPECT_FALSE(funcWrapper(func, aNaN, neg_sNaN)); + EXPECT_TRUE(funcWrapper(func, sNaN, neg_aNaN)); + EXPECT_TRUE(funcWrapper(func, sNaN, neg_sNaN)); + } + + void testQuietVsSignalingNaN(TotalOrderMagFunc func) { + EXPECT_FALSE(funcWrapper(func, neg_aNaN, neg_sNaN)); + EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_aNaN)); + EXPECT_TRUE(funcWrapper(func, sNaN, aNaN)); + EXPECT_FALSE(funcWrapper(func, aNaN, sNaN)); + } + + void testNaNPayloads(TotalOrderMagFunc func) { + T qnan_123 = 
FPBits::quiet_nan(Sign::POS, 0x123).get_val(); + T neg_qnan_123 = FPBits::quiet_nan(Sign::NEG, 0x123).get_val(); + T snan_123 = FPBits::signaling_nan(Sign::POS, 0x123).get_val(); + T neg_snan_123 = FPBits::signaling_nan(Sign::NEG, 0x123).get_val(); + + EXPECT_TRUE(funcWrapper(func, aNaN, aNaN)); + EXPECT_TRUE(funcWrapper(func, sNaN, sNaN)); + EXPECT_TRUE(funcWrapper(func, aNaN, qnan_123)); + EXPECT_TRUE(funcWrapper(func, sNaN, snan_123)); + EXPECT_FALSE(funcWrapper(func, qnan_123, aNaN)); + EXPECT_FALSE(funcWrapper(func, snan_123, sNaN)); + + EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_aNaN)); + EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_sNaN)); + EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_qnan_123)); + EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_snan_123)); + EXPECT_FALSE(funcWrapper(func, neg_qnan_123, neg_aNaN)); + EXPECT_FALSE(funcWrapper(func, neg_snan_123, neg_sNaN)); + } +}; + +#define LIST_TOTALORDERMAG_TESTS(T, func) \ + using LlvmLibcTotalOrderMagTest = TotalOrderMagTestTemplate; \ + TEST_F(LlvmLibcTotalOrderMagTest, XLesserThanY) { testXLesserThanY(&func); } \ + TEST_F(LlvmLibcTotalOrderMagTest, XGreaterThanY) { \ + testXGreaterThanY(&func); \ + } \ + TEST_F(LlvmLibcTotalOrderMagTest, XEqualToY) { testXEqualToY(&func); } \ + TEST_F(LlvmLibcTotalOrderMagTest, SingleNaN) { testSingleNaN(&func); } \ + TEST_F(LlvmLibcTotalOrderMagTest, NaNSigns) { testNaNSigns(&func); } \ + TEST_F(LlvmLibcTotalOrderMagTest, QuietVsSignalingNaN) { \ + testQuietVsSignalingNaN(&func); \ + } \ + TEST_F(LlvmLibcTotalOrderMagTest, NaNPayloads) { testNaNPayloads(&func); } + +#endif // LIBC_TEST_SRC_MATH_SMOKE_TOTALORDERMAGTEST_H diff --git a/libc/test/src/math/smoke/TotalOrderTest.h b/libc/test/src/math/smoke/TotalOrderTest.h new file mode 100644 index 00000000000000..281b2a59f930db --- /dev/null +++ b/libc/test/src/math/smoke/TotalOrderTest.h @@ -0,0 +1,138 @@ +//===-- Utility class to test different flavors of totalorder ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LIBC_TEST_SRC_MATH_SMOKE_TOTALORDERTEST_H +#define LIBC_TEST_SRC_MATH_SMOKE_TOTALORDERTEST_H + +#include "test/UnitTest/FEnvSafeTest.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +template +class TotalOrderTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { + + DECLARE_SPECIAL_CONSTANTS(T) + +public: + typedef int (*TotalOrderFunc)(const T *, const T *); + + bool funcWrapper(TotalOrderFunc func, T x, T y) { return func(&x, &y) != 0; } + + void testXLesserThanY(TotalOrderFunc func) { + EXPECT_TRUE(funcWrapper(func, neg_inf, inf)); + + EXPECT_TRUE(funcWrapper(func, T(0.0), T(0.1))); + EXPECT_TRUE(funcWrapper(func, T(0.0), T(123.38))); + + EXPECT_TRUE(funcWrapper(func, T(-0.1), T(0.0))); + EXPECT_TRUE(funcWrapper(func, T(-123.38), T(0.0))); + + EXPECT_TRUE(funcWrapper(func, T(-0.1), T(0.1))); + EXPECT_TRUE(funcWrapper(func, T(-123.38), T(123.38))); + } + + void testXGreaterThanY(TotalOrderFunc func) { + EXPECT_FALSE(funcWrapper(func, inf, neg_inf)); + + EXPECT_FALSE(funcWrapper(func, T(0.0), T(-0.1))); + EXPECT_FALSE(funcWrapper(func, T(0.0), T(-123.38))); + + EXPECT_FALSE(funcWrapper(func, T(0.1), T(0.0))); + EXPECT_FALSE(funcWrapper(func, T(123.38), T(0.0))); + + EXPECT_FALSE(funcWrapper(func, T(0.1), T(-0.1))); + EXPECT_FALSE(funcWrapper(func, T(123.38), T(-123.38))); + } + + void testXEqualToY(TotalOrderFunc func) { + EXPECT_TRUE(funcWrapper(func, inf, inf)); + EXPECT_TRUE(funcWrapper(func, neg_inf, neg_inf)); + + EXPECT_TRUE(funcWrapper(func, T(-0.0), T(0.0))); + EXPECT_FALSE(funcWrapper(func, T(0.0), T(-0.0))); + + EXPECT_TRUE(funcWrapper(func, T(0.0), T(0.0))); + EXPECT_TRUE(funcWrapper(func, T(-0.0), T(-0.0))); + EXPECT_TRUE(funcWrapper(func, T(0.1), T(0.1))); + EXPECT_TRUE(funcWrapper(func, T(-0.1), T(-0.1))); + EXPECT_TRUE(funcWrapper(func, T(123.38), T(123.38))); + EXPECT_TRUE(funcWrapper(func, T(-123.38), T(-123.38))); + } + + void testSingleNaN(TotalOrderFunc func) { + EXPECT_TRUE(funcWrapper(func, neg_aNaN, T(0.0))); + EXPECT_TRUE(funcWrapper(func, neg_aNaN, T(0.1))); + EXPECT_TRUE(funcWrapper(func, neg_aNaN, T(123.38))); + + EXPECT_FALSE(funcWrapper(func, T(0.0), neg_aNaN)); + EXPECT_FALSE(funcWrapper(func, T(0.1), neg_aNaN)); + EXPECT_FALSE(funcWrapper(func, T(123.38), neg_aNaN)); + + EXPECT_TRUE(funcWrapper(func, T(0.0), aNaN)); + EXPECT_TRUE(funcWrapper(func, T(0.1), aNaN)); + EXPECT_TRUE(funcWrapper(func, T(123.38), aNaN)); + + EXPECT_FALSE(funcWrapper(func, aNaN, T(0.0))); + EXPECT_FALSE(funcWrapper(func, aNaN, T(0.1))); + EXPECT_FALSE(funcWrapper(func, aNaN, T(123.38))); + } + + void testNaNSigns(TotalOrderFunc func) { + EXPECT_TRUE(funcWrapper(func, neg_aNaN, aNaN)); + EXPECT_TRUE(funcWrapper(func, neg_aNaN, sNaN)); + EXPECT_TRUE(funcWrapper(func, neg_sNaN, aNaN)); + EXPECT_TRUE(funcWrapper(func, neg_sNaN, sNaN)); + + EXPECT_FALSE(funcWrapper(func, aNaN, neg_aNaN)); + EXPECT_FALSE(funcWrapper(func, aNaN, neg_sNaN)); + EXPECT_FALSE(funcWrapper(func, sNaN, neg_aNaN)); + EXPECT_FALSE(funcWrapper(func, sNaN, neg_sNaN)); + } + + void testQuietVsSignalingNaN(TotalOrderFunc func) { + EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_sNaN)); + EXPECT_FALSE(funcWrapper(func, neg_sNaN, neg_aNaN)); + EXPECT_TRUE(funcWrapper(func, sNaN, aNaN)); + EXPECT_FALSE(funcWrapper(func, aNaN, sNaN)); + } + + void testNaNPayloads(TotalOrderFunc func) { + T qnan_123 = FPBits::quiet_nan(Sign::POS, 
0x123).get_val(); + T neg_qnan_123 = FPBits::quiet_nan(Sign::NEG, 0x123).get_val(); + T snan_123 = FPBits::signaling_nan(Sign::POS, 0x123).get_val(); + T neg_snan_123 = FPBits::signaling_nan(Sign::NEG, 0x123).get_val(); + + EXPECT_TRUE(funcWrapper(func, aNaN, aNaN)); + EXPECT_TRUE(funcWrapper(func, sNaN, sNaN)); + EXPECT_TRUE(funcWrapper(func, aNaN, qnan_123)); + EXPECT_TRUE(funcWrapper(func, sNaN, snan_123)); + EXPECT_FALSE(funcWrapper(func, qnan_123, aNaN)); + EXPECT_FALSE(funcWrapper(func, snan_123, sNaN)); + + EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_aNaN)); + EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_sNaN)); + EXPECT_FALSE(funcWrapper(func, neg_aNaN, neg_qnan_123)); + EXPECT_FALSE(funcWrapper(func, neg_sNaN, neg_snan_123)); + EXPECT_TRUE(funcWrapper(func, neg_qnan_123, neg_aNaN)); + EXPECT_TRUE(funcWrapper(func, neg_snan_123, neg_sNaN)); + } +}; + +#define LIST_TOTALORDER_TESTS(T, func) \ + using LlvmLibcTotalOrderTest = TotalOrderTestTemplate; \ + TEST_F(LlvmLibcTotalOrderTest, XLesserThanY) { testXLesserThanY(&func); } \ + TEST_F(LlvmLibcTotalOrderTest, XGreaterThanY) { testXGreaterThanY(&func); } \ + TEST_F(LlvmLibcTotalOrderTest, XEqualToY) { testXEqualToY(&func); } \ + TEST_F(LlvmLibcTotalOrderTest, SingleNaN) { testSingleNaN(&func); } \ + TEST_F(LlvmLibcTotalOrderTest, NaNSigns) { testNaNSigns(&func); } \ + TEST_F(LlvmLibcTotalOrderTest, QuietVsSignalingNaN) { \ + testQuietVsSignalingNaN(&func); \ + } \ + TEST_F(LlvmLibcTotalOrderTest, NaNPayloads) { testNaNPayloads(&func); } + +#endif // LIBC_TEST_SRC_MATH_SMOKE_TOTALORDERTEST_H diff --git a/libc/test/src/math/smoke/totalorderf16_test.cpp b/libc/test/src/math/smoke/totalorderf16_test.cpp new file mode 100644 index 00000000000000..410c70c47c51d8 --- /dev/null +++ b/libc/test/src/math/smoke/totalorderf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for totalorderf16 ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TotalOrderTest.h" + +#include "src/math/totalorderf16.h" + +LIST_TOTALORDER_TESTS(float16, LIBC_NAMESPACE::totalorderf16) diff --git a/libc/test/src/math/smoke/totalordermagf16_test.cpp b/libc/test/src/math/smoke/totalordermagf16_test.cpp new file mode 100644 index 00000000000000..b09eb11cd9c3bb --- /dev/null +++ b/libc/test/src/math/smoke/totalordermagf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for totalordermagf16 ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderMagTest.h"
+
+#include "src/math/totalordermagf16.h"
+
+LIST_TOTALORDERMAG_TESTS(float16, LIBC_NAMESPACE::totalordermagf16)
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index 8d24457186310c..28359b7bb49ac4 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -65,4 +65,5 @@
 "`3343 <https://wg21.link/LWG3343>`__","Ordering of calls to ``unlock()`` and ``notify_all()`` in Effects element of ``notify_all_at_thread_exit()`` should be reversed","Not Yet Adopted","|Complete|","16.0",""
 "XXXX","","The sys_info range should be affected by save","Not Yet Adopted","|Complete|","19.0"
 "`4071 <https://wg21.link/LWG4071>`__","","``reference_wrapper`` comparisons are not SFINAE-friendly","Not Yet Adopted","|Complete|","19.0"
+"`4110 <https://wg21.link/LWG4110>`__","","``shared_ptr(nullptr_t, Deleter)`` is overconstrained, breaking some sensible deleters","Not Yet Adopted","|Complete|","19.0"
 "","","","","",""
diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index 00db96185be7c6..7b5002cb95d32b 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -404,7 +404,7 @@ struct __shared_ptr_deleter_ctor_reqs {
 };

 template <class _Dp, class _Tp>
-using __shared_ptr_nullptr_deleter_ctor_reqs = _And<is_move_constructible<_Dp>, __well_formed_deleter<_Dp, nullptr_t> >;
+using __shared_ptr_nullptr_deleter_ctor_reqs = _And<is_move_constructible<_Dp>, __well_formed_deleter<_Dp, _Tp*> >;

 #if defined(_LIBCPP_ABI_ENABLE_SHARED_PTR_TRIVIAL_ABI)
 #  define _LIBCPP_SHARED_PTR_TRIVIAL_ABI __attribute__((__trivial_abi__))
diff --git a/libcxx/include/string b/libcxx/include/string
index 1db803e822d727..751af8f1476d0d 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -868,23 +868,9 @@ private:
   static_assert(sizeof(__short) == (sizeof(value_type) * (__min_cap + 1)), "__short has an unexpected size.");

-  union __ulx {
-    __long __lx;
-    __short __lxx;
-  };
-
-  enum { __n_words = sizeof(__ulx) / sizeof(size_type) };
-
-  struct __raw {
-    size_type __words[__n_words];
-  };
-
-  struct __rep {
-    union {
-      __short __s;
-      __long __l;
-      __raw __r;
-    };
+  union __rep {
+    __short __s;
+    __long __l;
   };

   __compressed_pair<__rep, allocator_type> __r_;
@@ -3746,17 +3732,10 @@ template <class _Allocator>
 inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool
 operator==(const basic_string<char, char_traits<char>, _Allocator>& __lhs,
            const basic_string<char, char_traits<char>, _Allocator>& __rhs) _NOEXCEPT {
-  size_t __lhs_sz = __lhs.size();
-  if (__lhs_sz != __rhs.size())
+  size_t __sz = __lhs.size();
+  if (__sz != __rhs.size())
     return false;
-  const char* __lp = __lhs.data();
-  const char* __rp = __rhs.data();
-  if (__lhs.__is_long())
-    return char_traits<char>::compare(__lp, __rp, __lhs_sz) == 0;
-  for (; __lhs_sz != 0; --__lhs_sz, ++__lp, ++__rp)
-    if (*__lp != *__rp)
-      return false;
-  return true;
+  return char_traits<char>::compare(__lhs.data(), __rhs.data(), __sz) == 0;
 }

 #if _LIBCPP_STD_VER <= 17
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.local_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.local_time.pass.cpp
index a8c468a6c6fd4d..ec3e490c0ed790 100644
--- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.local_time.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.local_time.pass.cpp
@@ -12,7 +12,7 @@
 // XFAIL: libcpp-has-no-experimental-tzdb
 // XFAIL:
availability-tzdb-missing // Times out under HWASan -// XFAIL: hwasan +// UNSUPPORTED: hwasan // diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp index 13340ed5294c05..4ea752b36bd018 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp @@ -32,17 +32,16 @@ int A::count = 0; // LWG 3233. Broken requirements for shared_ptr converting constructors // https://cplusplus.github.io/LWG/issue3233 static_assert( std::is_constructible, std::nullptr_t, test_deleter >::value, ""); -static_assert(!std::is_constructible, std::nullptr_t, bad_deleter>::value, ""); -static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter>::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, bad_deleter>::value, ""); static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter>::value, ""); #if TEST_STD_VER >= 17 -static_assert( std::is_constructible, std::nullptr_t, test_deleter >::value, ""); +static_assert(std::is_constructible, std::nullptr_t, test_deleter >::value, ""); static_assert(!std::is_constructible, std::nullptr_t, bad_deleter>::value, ""); static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter>::value, ""); static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter>::value, ""); -static_assert( std::is_constructible, std::nullptr_t, test_deleter >::value, ""); +static_assert(std::is_constructible, std::nullptr_t, test_deleter >::value, ""); static_assert(!std::is_constructible, std::nullptr_t, bad_deleter>::value, ""); static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter>::value, ""); static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter>::value, ""); diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp index 53ca6fb5b234d4..a479b24c4595ab 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp @@ -33,17 +33,21 @@ int A::count = 0; // LWG 3233. 
Broken requirements for shared_ptr converting constructors // https://cplusplus.github.io/LWG/issue3233 static_assert( std::is_constructible, std::nullptr_t, test_deleter, test_allocator >::value, ""); -static_assert(!std::is_constructible, std::nullptr_t, bad_deleter, test_allocator >::value, ""); -static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter, test_allocator >::value, ""); +static_assert(!std::is_constructible, std::nullptr_t, bad_deleter, test_allocator >::value, + ""); static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter, test_allocator >::value, ""); #if TEST_STD_VER >= 17 -static_assert( std::is_constructible, std::nullptr_t, test_deleter, test_allocator >::value, ""); +static_assert( + std::is_constructible, std::nullptr_t, test_deleter, test_allocator >::value, + ""); static_assert(!std::is_constructible, std::nullptr_t, bad_deleter, test_allocator >::value, ""); static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter, test_allocator >::value, ""); static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter, test_allocator >::value, ""); -static_assert( std::is_constructible, std::nullptr_t, test_deleter, test_allocator >::value, ""); +static_assert( + std::is_constructible, std::nullptr_t, test_deleter, test_allocator >::value, + ""); static_assert(!std::is_constructible, std::nullptr_t, bad_deleter, test_allocator >::value, ""); static_assert(!std::is_constructible, std::nullptr_t, no_nullptr_deleter, test_allocator >::value, ""); static_assert(!std::is_constructible, std::nullptr_t, no_move_deleter, test_allocator >::value, ""); diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp index 562acf56d96fe1..95dcb92b51993c 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp @@ -115,6 +115,14 @@ int main(int, char**) } #endif // TEST_STD_VER >= 11 +#if TEST_STD_VER >= 14 + { + // LWG 4110 + auto deleter = [](auto pointer) { delete pointer; }; + std::shared_ptr p(new int, deleter); + } +#endif + test_function_type(); return 0; } diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp index 9dffbcdd59a735..89e7d0b02d421b 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp @@ -165,5 +165,13 @@ int main(int, char**) test_allocator >::value, ""); } +#if TEST_STD_VER >= 14 + { + // LWG 4110 + auto deleter = [](auto pointer) { delete pointer; }; + std::shared_ptr p(new int, deleter, std::allocator()); + } +#endif + return 0; } diff --git a/lld/MachO/UnwindInfoSection.cpp b/lld/MachO/UnwindInfoSection.cpp index 0ac2f39a6180c7..7033481d6014b5 100644 --- a/lld/MachO/UnwindInfoSection.cpp +++ b/lld/MachO/UnwindInfoSection.cpp @@ -298,19 +298,31 @@ void 
UnwindInfoSectionImpl::prepareRelocations(ConcatInputSection *isec) { assert(!isCoalescedWeak(referentIsec)); // Personality functions can be referenced via section relocations // if they live in the same object file. Create placeholder synthetic - // symbols for them in the GOT. + // symbols for them in the GOT. If the corresponding symbol is already + // in the GOT, use that to avoid creating a duplicate entry. All GOT + // entries needed by non-unwind sections will have already been added + // by this point. Symbol *&s = personalityTable[{referentIsec, r.addend}]; if (s == nullptr) { - // This runs after dead stripping, so the noDeadStrip argument does not - // matter. - s = make("", /*file=*/nullptr, referentIsec, - r.addend, /*size=*/0, /*isWeakDef=*/false, - /*isExternal=*/false, /*isPrivateExtern=*/false, - /*includeInSymtab=*/true, - /*isReferencedDynamically=*/false, - /*noDeadStrip=*/false); - s->used = true; - in.got->addEntry(s); + Defined *const *gotEntry = + llvm::find_if(referentIsec->symbols, [&](Defined const *d) { + return d->value == static_cast(r.addend) && + d->isInGot(); + }); + if (gotEntry != referentIsec->symbols.end()) { + s = *gotEntry; + } else { + // This runs after dead stripping, so the noDeadStrip argument does + // not matter. + s = make("", /*file=*/nullptr, referentIsec, + r.addend, /*size=*/0, /*isWeakDef=*/false, + /*isExternal=*/false, /*isPrivateExtern=*/false, + /*includeInSymtab=*/true, + /*isReferencedDynamically=*/false, + /*noDeadStrip=*/false); + s->used = true; + in.got->addEntry(s); + } } r.referent = s; r.addend = 0; diff --git a/lld/test/MachO/compact-unwind-both-local-and-dylib-personality.s b/lld/test/MachO/compact-unwind-both-local-and-dylib-personality.s index 676577d6b17e9f..35f39ba5fb1e21 100644 --- a/lld/test/MachO/compact-unwind-both-local-and-dylib-personality.s +++ b/lld/test/MachO/compact-unwind-both-local-and-dylib-personality.s @@ -42,19 +42,20 @@ # RUN: llvm-objdump --macho --indirect-symbols --unwind-info --bind %t/d.out | FileCheck %s --check-prefixes=D -D#%x,OFF=0x100000000 -# A: Indirect symbols for (__DATA_CONST,__got) +# A: Indirect symbols for (__DATA_CONST,__got) 4 entries # A-NEXT: address index name # A: 0x[[#%x,GXX_PERSONALITY_LO:]] [[#]] ___gxx_personality_v0 +# A: 0x[[#%x,PERSONALITY_1:]] [[#]] _personality_1 +# A: 0x[[#%x,PERSONALITY_2:]] [[#]] _personality_2 # A: 0x[[#%x,GXX_PERSONALITY_HI:]] [[#]] ___gxx_personality_v0 -# A: 0x[[#%x,PERSONALITY_1:]] LOCAL -# A: 0x[[#%x,PERSONALITY_2:]] LOCAL # BC: Indirect symbols for (__DATA_CONST,__got) # BC-NEXT: address index name -# C: 0x[[#%x,GXX_PERSONALITY_HI:]] LOCAL # BC: 0x[[#%x,GXX_PERSONALITY_LO:]] LOCAL -# BC: 0x[[#%x,PERSONALITY_1:]] LOCAL -# BC: 0x[[#%x,PERSONALITY_2:]] LOCAL +# C: 0x[[#%x,GXX_PERSONALITY_HI:]] [[#]] ___gxx_personality_v0 +# BC: 0x[[#%x,PERSONALITY_1:]] [[#]] _personality_1 +# BC: 0x[[#%x,PERSONALITY_2:]] [[#]] _personality_2 +# BC-EMPTY: # CHECK: Personality functions: (count = 3) # CHECK-DAG: personality[{{[0-9]+}}]: 0x{{0*}}[[#GXX_PERSONALITY_LO-OFF]] @@ -66,7 +67,7 @@ # A-NEXT: __DATA_CONST __got 0x[[#GXX_PERSONALITY_LO-0]] pointer 0 libc++abi ___gxx_personality_v0 -# D: Indirect symbols for (__DATA_CONST,__got) +# D: Indirect symbols for (__DATA_CONST,__got) 6 entries # D-NEXT: address index name # D: 0x[[#%x,GXX_PERSONALITY_HI:]] [[#]] ___gxx_personality_v0 # D: 0x[[#%x,PERSONALITY_1:]] [[#]] _personality_1 diff --git a/lld/test/MachO/compact-unwind.s b/lld/test/MachO/compact-unwind.s index fa73ccb10a32a2..27e4b44dc0b09f 100644 --- 
a/lld/test/MachO/compact-unwind.s +++ b/lld/test/MachO/compact-unwind.s @@ -29,7 +29,7 @@ # FIRST: Indirect symbols for (__DATA_CONST,__got) # FIRST-NEXT: address index name # FIRST-DAG: 0x[[#%x,GXX_PERSONALITY:]] [[#]] ___gxx_personality_v0 -# FIRST-DAG: 0x[[#%x,MY_PERSONALITY:]] LOCAL +# FIRST-DAG: 0x[[#%x,MY_PERSONALITY:]] # SECOND: Indirect symbols for (__DATA_CONST,__got) # SECOND-NEXT: address index name diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp index db96ee2cec383e..bb2be560ebfff3 100644 --- a/lldb/source/Commands/CommandObjectThread.cpp +++ b/lldb/source/Commands/CommandObjectThread.cpp @@ -383,7 +383,7 @@ class CommandObjectThreadStepWithTypeAndScope : public CommandObjectParsed { eCommandProcessMustBePaused), m_step_type(step_type), m_step_scope(step_scope), m_class_options("scripted step") { - AddSimpleArgumentList(eArgTypeThreadID, eArgRepeatOptional); + AddSimpleArgumentList(eArgTypeThreadIndex, eArgRepeatOptional); if (step_type == eStepTypeScripted) { m_all_options.Append(&m_class_options, LLDB_OPT_SET_1 | LLDB_OPT_SET_2, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp index 7cf92adc6ef578..992d814793f9d2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp @@ -367,20 +367,6 @@ lldb_private::Type *DWARFDIE::ResolveTypeUID(const DWARFDIE &die) const { return nullptr; } -std::vector DWARFDIE::GetDeclContextDIEs() const { - if (!IsValid()) - return {}; - - std::vector result; - DWARFDIE parent = GetParentDeclContextDIE(); - while (parent.IsValid() && parent.GetDIE() != GetDIE()) { - result.push_back(std::move(parent)); - parent = parent.GetParentDeclContextDIE(); - } - - return result; -} - static void GetDeclContextImpl(DWARFDIE die, llvm::SmallSet &seen, std::vector &context) { @@ -491,6 +477,18 @@ static void GetTypeLookupContextImpl(DWARFDIE die, case DW_TAG_base_type: push_ctx(CompilerContextKind::Builtin, name); break; + // If any of the tags below appear in the parent chain, stop the decl + // context and return. Prior to these being in here, if a type existed in a + // namespace "a" like "a::my_struct", but we also have a function in that + // same namespace "a" which contained a type named "my_struct", both would + // return "a::my_struct" as the declaration context since the + // DW_TAG_subprogram would be skipped and its parent would be found. + case DW_TAG_compile_unit: + case DW_TAG_type_unit: + case DW_TAG_subprogram: + case DW_TAG_lexical_block: + case DW_TAG_inlined_subroutine: + return; default: break; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h index 511ca62d0197a8..c74a82061fccf2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h @@ -69,9 +69,6 @@ class DWARFDIE : public DWARFBaseDIE { DWARFDIE GetParentDeclContextDIE() const; - // DeclContext related functions - std::vector GetDeclContextDIEs() const; - /// Return this DIE's decl context as it is needed to look up types /// in Clang modules. This context will include any modules or functions that /// the type is declared in so an exact module match can be efficiently made. 
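The early-return tags added to GetTypeLookupContextImpl above are easiest to see against a concrete source. A minimal sketch (illustration only, not from the patch; the names are invented) of the shadowing scenario the new comment describes:

    // Two distinct "my_struct" types whose DIE parent chains differ only
    // below a DW_TAG_subprogram.
    namespace a {
    struct my_struct {};   // parents: DW_TAG_namespace -> DW_TAG_compile_unit,
                           // so its lookup context is "a::my_struct"
    void f() {
      struct my_struct {}; // parents include DW_TAG_subprogram; the walk now
                           // returns there, leaving the context as "my_struct"
    }
    } // namespace a

Before this change both types would have been reported as "a::my_struct", because the walk skipped the DW_TAG_subprogram and kept climbing to the enclosing namespace.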
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index a52a7d67673742..d9e81f9c105b2d 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -3039,95 +3039,6 @@ TypeSP SymbolFileDWARF::FindCompleteObjCDefinitionTypeForDIE( return type_sp; } -// This function helps to ensure that the declaration contexts match for two -// different DIEs. Often times debug information will refer to a forward -// declaration of a type (the equivalent of "struct my_struct;". There will -// often be a declaration of that type elsewhere that has the full definition. -// When we go looking for the full type "my_struct", we will find one or more -// matches in the accelerator tables and we will then need to make sure the -// type was in the same declaration context as the original DIE. This function -// can efficiently compare two DIEs and will return true when the declaration -// context matches, and false when they don't. -bool SymbolFileDWARF::DIEDeclContextsMatch(const DWARFDIE &die1, - const DWARFDIE &die2) { - if (die1 == die2) - return true; - - std::vector decl_ctx_1; - std::vector decl_ctx_2; - // The declaration DIE stack is a stack of the declaration context DIEs all - // the way back to the compile unit. If a type "T" is declared inside a class - // "B", and class "B" is declared inside a class "A" and class "A" is in a - // namespace "lldb", and the namespace is in a compile unit, there will be a - // stack of DIEs: - // - // [0] DW_TAG_class_type for "B" - // [1] DW_TAG_class_type for "A" - // [2] DW_TAG_namespace for "lldb" - // [3] DW_TAG_compile_unit or DW_TAG_partial_unit for the source file. - // - // We grab both contexts and make sure that everything matches all the way - // back to the compiler unit. - - // First lets grab the decl contexts for both DIEs - decl_ctx_1 = die1.GetDeclContextDIEs(); - decl_ctx_2 = die2.GetDeclContextDIEs(); - // Make sure the context arrays have the same size, otherwise we are done - const size_t count1 = decl_ctx_1.size(); - const size_t count2 = decl_ctx_2.size(); - if (count1 != count2) - return false; - - // Make sure the DW_TAG values match all the way back up the compile unit. If - // they don't, then we are done. - DWARFDIE decl_ctx_die1; - DWARFDIE decl_ctx_die2; - size_t i; - for (i = 0; i < count1; i++) { - decl_ctx_die1 = decl_ctx_1[i]; - decl_ctx_die2 = decl_ctx_2[i]; - if (decl_ctx_die1.Tag() != decl_ctx_die2.Tag()) - return false; - } -#ifndef NDEBUG - - // Make sure the top item in the decl context die array is always - // DW_TAG_compile_unit or DW_TAG_partial_unit. If it isn't then - // something went wrong in the DWARFDIE::GetDeclContextDIEs() - // function. - dw_tag_t cu_tag = decl_ctx_1[count1 - 1].Tag(); - UNUSED_IF_ASSERT_DISABLED(cu_tag); - assert(cu_tag == DW_TAG_compile_unit || cu_tag == DW_TAG_partial_unit); - -#endif - // Always skip the compile unit when comparing by only iterating up to "count - // - 1". Here we compare the names as we go. - for (i = 0; i < count1 - 1; i++) { - decl_ctx_die1 = decl_ctx_1[i]; - decl_ctx_die2 = decl_ctx_2[i]; - const char *name1 = decl_ctx_die1.GetName(); - const char *name2 = decl_ctx_die2.GetName(); - // If the string was from a DW_FORM_strp, then the pointer will often be - // the same! - if (name1 == name2) - continue; - - // Name pointers are not equal, so only compare the strings if both are not - // NULL. 
- if (name1 && name2) { - // If the strings don't compare, we are done... - if (strcmp(name1, name2) != 0) - return false; - } else { - // One name was NULL while the other wasn't - return false; - } - } - // We made it through all of the checks and the declaration contexts are - // equal. - return true; -} - TypeSP SymbolFileDWARF::FindDefinitionTypeForDWARFDeclContext(const DWARFDIE &die) { TypeSP type_sp; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index 7282c08c6857c9..5d3654efcce544 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -461,8 +461,6 @@ class SymbolFileDWARF : public SymbolFileCommon { FindBlockContainingSpecification(const DWARFDIE &die, dw_offset_t spec_block_die_offset); - bool DIEDeclContextsMatch(const DWARFDIE &die1, const DWARFDIE &die2); - bool ClassContainsSelector(const DWARFDIE &class_die, ConstString selector); /// Parse call site entries (DW_TAG_call_site), including any nested call site diff --git a/lldb/test/API/commands/process/attach/attach_denied/TestAttachDenied.py b/lldb/test/API/commands/process/attach/attach_denied/TestAttachDenied.py index 22dca62045022e..d72a710e8127bf 100644 --- a/lldb/test/API/commands/process/attach/attach_denied/TestAttachDenied.py +++ b/lldb/test/API/commands/process/attach/attach_denied/TestAttachDenied.py @@ -18,6 +18,7 @@ class AttachDeniedTestCase(TestBase): @skipIfWindows @skipIfiOSSimulator @skipIfDarwinEmbedded # ptrace(ATTACH_REQUEST...) won't work on ios/tvos/etc + @skipIfAsan # Times out inconsistently under asan def test_attach_to_process_by_id_denied(self): """Test attach by process id denied""" self.build() diff --git a/lldb/test/API/functionalities/type_types/Makefile b/lldb/test/API/functionalities/type_types/Makefile new file mode 100644 index 00000000000000..3d0b98f13f3d7b --- /dev/null +++ b/lldb/test/API/functionalities/type_types/Makefile @@ -0,0 +1,2 @@ +CXX_SOURCES := main.cpp +include Makefile.rules diff --git a/lldb/test/API/functionalities/type_types/TestFindTypes.py b/lldb/test/API/functionalities/type_types/TestFindTypes.py new file mode 100644 index 00000000000000..42b5c4cfaaf77c --- /dev/null +++ b/lldb/test/API/functionalities/type_types/TestFindTypes.py @@ -0,0 +1,66 @@ +""" +Test the SBModule and SBTarget type lookup APIs to find multiple types. +""" + +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TypeFindFirstTestCase(TestBase): + def test_find_first_type(self): + """ + Test SBTarget::FindTypes() and SBModule::FindTypes() APIs. + + We had issues where our declaration context when finding types was + incorrectly calculated where a type in a namepace, and a type in a + function that was also in the same namespace would match a lookup. For + example: + + namespace a { + struct Foo { + int foo; + }; + + unsigned foo() { + typedef unsigned Foo; + Foo foo = 12; + return foo; + } + } // namespace a + + + Previously LLDB would calculate the declaration context of "a::Foo" + correctly, but incorrectly calculate the declaration context of "Foo" + from within the foo() function as "a::Foo". Adding tests to ensure this + works correctly. 
+ """ + self.build() + target = self.createTestTarget() + exe_module = target.GetModuleAtIndex(0) + self.assertTrue(exe_module.IsValid()) + # Test the SBTarget and SBModule APIs for FindFirstType + for api in [target, exe_module]: + # We should find the "a::Foo" but not the "Foo" type in the function + types = api.FindTypes("a::Foo") + self.assertEqual(types.GetSize(), 1) + type_str0 = str(types.GetTypeAtIndex(0)) + self.assertIn('struct Foo {', type_str0) + + # When we search by type basename, we should find any type whose + # basename matches "Foo", so "a::Foo" and the "Foo" type in the + # function. + types = api.FindTypes("Foo") + self.assertEqual(types.GetSize(), 2) + type_str0 = str(types.GetTypeAtIndex(0)) + type_str1 = str(types.GetTypeAtIndex(1)) + # We don't know which order the types will come back as, so + self.assertEqual(set([str(t).split('\n')[0] for t in types]), set(["typedef Foo", "struct Foo {"])) + + # When we search by type basename with "::" prepended, we should + # only types in the root namespace which means only "Foo" type in + # the function. + types = api.FindTypes("::Foo") + self.assertEqual(types.GetSize(), 1) + type_str0 = str(types.GetTypeAtIndex(0)) + self.assertIn('typedef Foo', type_str0) diff --git a/lldb/test/API/functionalities/type_types/main.cpp b/lldb/test/API/functionalities/type_types/main.cpp new file mode 100644 index 00000000000000..095328932cdc46 --- /dev/null +++ b/lldb/test/API/functionalities/type_types/main.cpp @@ -0,0 +1,15 @@ +namespace a { +struct Foo {}; + +unsigned foo() { + typedef unsigned Foo; + Foo foo = 12; + return foo; +} +} // namespace a + +int main() { + a::Foo f = {}; + a::foo(); + return 0; +} diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp index bea07dfa27cc6a..65da7de1ba2d8a 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp @@ -258,3 +258,110 @@ TEST(DWARFDIETest, GetContext) { struct_die.GetTypeLookupContext(), testing::ElementsAre(make_namespace("NAMESPACE"), make_struct("STRUCT"))); } + +TEST(DWARFDIETest, GetContextInFunction) { + // Make sure we get the right context fo each "struct_t" type. The first + // should be "a::struct_t" and the one defined in the "foo" function should be + // "struct_t". Previous DWARFDIE::GetTypeLookupContext() function calls would + // have the "struct_t" in "foo" be "a::struct_t" because it would traverse the + // entire die parent tree and ignore DW_TAG_subprogram and keep traversing the + // parents. 
+ // + // 0x0000000b: DW_TAG_compile_unit + // 0x0000000c: DW_TAG_namespace + // DW_AT_name("a") + // 0x0000000f: DW_TAG_structure_type + // DW_AT_name("struct_t") + // 0x00000019: DW_TAG_subprogram + // DW_AT_name("foo") + // 0x0000001e: DW_TAG_structure_type + // DW_AT_name("struct_t") + // 0x00000028: NULL + // 0x00000029: NULL + // 0x0000002a: NULL + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_386 +DWARF: + debug_str: + - '' + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + - Code: 0x2 + Tag: DW_TAG_namespace + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Code: 0x3 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Code: 0x4 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + debug_info: + - Length: 0x27 + Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + - AbbrCode: 0x2 + Values: + - Value: 0xDEADBEEFDEADBEEF + CStr: a + - AbbrCode: 0x3 + Values: + - Value: 0xDEADBEEFDEADBEEF + CStr: struct_t + - AbbrCode: 0x4 + Values: + - Value: 0xDEADBEEFDEADBEEF + CStr: foo + - AbbrCode: 0x3 + Values: + - Value: 0xDEADBEEFDEADBEEF + CStr: struct_t + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x0)"; + + YAMLModuleTester t(yamldata); + auto *symbol_file = + llvm::cast(t.GetModule()->GetSymbolFile()); + DWARFUnit *unit = symbol_file->DebugInfo().GetUnitAtIndex(0); + ASSERT_TRUE(unit); + + auto make_namespace = [](llvm::StringRef name) { + return CompilerContext(CompilerContextKind::Namespace, ConstString(name)); + }; + auto make_struct = [](llvm::StringRef name) { + return CompilerContext(CompilerContextKind::Struct, ConstString(name)); + }; + // Grab the "a::struct_t" type from the "a" namespace + DWARFDIE a_struct_die = unit->DIE().GetFirstChild().GetFirstChild(); + ASSERT_TRUE(a_struct_die); + EXPECT_THAT( + a_struct_die.GetDeclContext(), + testing::ElementsAre(make_namespace("a"), make_struct("struct_t"))); + // Grab the "struct_t" defined in the "foo" function. + DWARFDIE foo_struct_die = + unit->DIE().GetFirstChild().GetFirstChild().GetSibling().GetFirstChild(); + EXPECT_THAT(foo_struct_die.GetTypeLookupContext(), + testing::ElementsAre(make_struct("struct_t"))); +} diff --git a/llvm/include/llvm/Analysis/CGSCCPassManager.h b/llvm/include/llvm/Analysis/CGSCCPassManager.h index 5654ad46d6eab0..b19d53621ac867 100644 --- a/llvm/include/llvm/Analysis/CGSCCPassManager.h +++ b/llvm/include/llvm/Analysis/CGSCCPassManager.h @@ -306,6 +306,10 @@ struct CGSCCUpdateResult { SmallDenseSet, 4> &InlinedInternalEdges; + /// Functions that a pass has considered to be dead to be removed at the end + /// of the call graph walk in batch. + SmallVector &DeadFunctions; + /// Weak VHs to keep track of indirect calls for the purposes of detecting /// devirtualization. /// diff --git a/llvm/include/llvm/Analysis/LazyCallGraph.h b/llvm/include/llvm/Analysis/LazyCallGraph.h index ac8ca207d312b7..a8bbf2c578af9b 100644 --- a/llvm/include/llvm/Analysis/LazyCallGraph.h +++ b/llvm/include/llvm/Analysis/LazyCallGraph.h @@ -832,7 +832,7 @@ class LazyCallGraph { /// self-edges and edge removals which result in a spanning tree with no /// more cycles. 
[[nodiscard]] SmallVector - removeInternalRefEdge(Node &SourceN, ArrayRef TargetNs); + removeInternalRefEdges(ArrayRef> Edges); /// A convenience wrapper around the above to handle trivial cases of /// inserting a new call edge. @@ -1056,18 +1056,18 @@ class LazyCallGraph { /// once SCCs have started to be formed. These routines have strict contracts /// but may be called at any point. - /// Remove a dead function from the call graph (typically to delete it). + /// Remove dead functions from the call graph. /// - /// Note that the function must have an empty use list, and the call graph - /// must be up-to-date prior to calling this. That means it is by itself in - /// a maximal SCC which is by itself in a maximal RefSCC, etc. No structural - /// changes result from calling this routine other than potentially removing - /// entry points into the call graph. + /// These functions should have already been passed to markDeadFunction(). + /// This is done as a batch to prevent compile time blowup as a result of + /// handling a single function at a time. + void removeDeadFunctions(ArrayRef DeadFs); + + /// Mark a function as dead to be removed later by removeDeadFunctions(). /// - /// If SCC formation has begun, this function must not be part of the current - /// DFS in order to call this safely. Typically, the function will have been - /// fully visited by the DFS prior to calling this routine. - void removeDeadFunction(Function &F); + /// The function body should have no incoming or outgoing call or ref edges. + /// For example, a function with a single "unreachable" instruction. + void markDeadFunction(Function &F); /// Add a new function split/outlined from an existing function. /// diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index b9f385f4c4b8fa..7a54fe55014be1 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -704,9 +704,10 @@ class LoopAccessInfo { const PredicatedScalarEvolution &getPSE() const { return *PSE; } private: - /// Analyze the loop. - void analyzeLoop(AAResults *AA, LoopInfo *LI, - const TargetLibraryInfo *TLI, DominatorTree *DT); + /// Analyze the loop. Returns true if all memory access in the loop can be + /// vectorized. + bool analyzeLoop(AAResults *AA, LoopInfo *LI, const TargetLibraryInfo *TLI, + DominatorTree *DT); /// Check if the structure of the loop allows it to be analyzed by this /// pass. diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h index 67e10c35894954..45599c940659e4 100644 --- a/llvm/include/llvm/MC/MCFragment.h +++ b/llvm/include/llvm/MC/MCFragment.h @@ -23,12 +23,15 @@ namespace llvm { +class MCAssembler; class MCSection; class MCSubtargetInfo; class MCSymbol; -class MCFragment : public ilist_node_with_parent { +class MCFragment { friend class MCAsmLayout; + friend class MCAssembler; + friend class MCSection; public: enum FragmentType : uint8_t { @@ -51,6 +54,9 @@ class MCFragment : public ilist_node_with_parent { }; private: + // The next fragment within the section. + MCFragment *Next = nullptr; + /// The data for the section this fragment is in. MCSection *Parent; @@ -64,10 +70,6 @@ class MCFragment : public ilist_node_with_parent { /// The layout order of this fragment. unsigned LayoutOrder; - /// The subsection this fragment belongs to. This is 0 if the fragment is not - // in any subsection. 
-  unsigned SubsectionNumber = 0;
-
   FragmentType Kind;
 
 protected:
@@ -88,6 +90,8 @@ class MCFragment : public ilist_node_with_parent<MCFragment, MCSection> {
   /// This method will dispatch to the appropriate subclass.
   void destroy();
 
+  MCFragment *getNext() const { return Next; }
+
   FragmentType getKind() const { return Kind; }
 
   MCSection *getParent() const { return Parent; }
@@ -104,9 +108,6 @@ class MCFragment : public ilist_node_with_parent<MCFragment, MCSection> {
   bool hasInstructions() const { return HasInstructions; }
 
   void dump() const;
-
-  void setSubsectionNumber(unsigned Value) { SubsectionNumber = Value; }
-  unsigned getSubsectionNumber() const { return SubsectionNumber; }
 };
 
 class MCDummyFragment : public MCFragment {
diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h
index e212d546139808..c0a337f5ea45e5 100644
--- a/llvm/include/llvm/MC/MCObjectStreamer.h
+++ b/llvm/include/llvm/MC/MCObjectStreamer.h
@@ -41,7 +41,6 @@ class raw_pwrite_stream;
 /// implementation.
 class MCObjectStreamer : public MCStreamer {
   std::unique_ptr<MCAssembler> Assembler;
-  MCSection::iterator CurInsertionPoint;
   bool EmitEHFrame;
   bool EmitDebugFrame;
   SmallVector<MCSymbol *, 2> PendingLabels;
@@ -94,7 +93,7 @@ class MCObjectStreamer : public MCStreamer {
   void insert(MCFragment *F) {
     flushPendingLabels(F);
     MCSection *CurSection = getCurrentSectionOnly();
-    CurSection->getFragmentList().insert(CurInsertionPoint, F);
+    CurSection->addFragment(*F);
     F->setParent(CurSection);
   }
 
diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h
index 217b9b4b5bc52b..e5455292d5c625 100644
--- a/llvm/include/llvm/MC/MCSection.h
+++ b/llvm/include/llvm/MC/MCSection.h
@@ -14,7 +14,6 @@
 #define LLVM_MC_MCSECTION_H
 
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/ilist.h"
 #include "llvm/MC/MCFragment.h"
 #include "llvm/MC/SectionKind.h"
 #include "llvm/Support/Alignment.h"
@@ -24,20 +23,18 @@
 namespace llvm {
 
 class MCAsmInfo;
+class MCAssembler;
 class MCContext;
 class MCExpr;
 class MCSymbol;
 class raw_ostream;
 class Triple;
 
-template <> struct ilist_alloc_traits<MCFragment> {
-  static void deleteNode(MCFragment *V);
-};
-
 /// Instances of this class represent a uniqued identifier for a section in the
 /// current translation unit. The MCContext class uniques and creates these.
 class MCSection {
 public:
+  friend MCAssembler;
   static constexpr unsigned NonUniqueID = ~0U;
 
   enum SectionVariant {
@@ -58,12 +55,29 @@ class MCSection {
     BundleLockedAlignToEnd
   };
 
-  using FragmentListType = iplist<MCFragment>;
-
-  using const_iterator = FragmentListType::const_iterator;
-  using iterator = FragmentListType::iterator;
+  struct iterator {
+    MCFragment *F = nullptr;
+    iterator() = default;
+    explicit iterator(MCFragment *F) : F(F) {}
+    MCFragment &operator*() const { return *F; }
+    bool operator==(const iterator &O) const { return F == O.F; }
+    bool operator!=(const iterator &O) const { return F != O.F; }
+    iterator &operator++() {
+      F = F->Next;
+      return *this;
+    }
+    iterator operator++(int) {
+      iterator T = *this;
+      F = F->Next;
+      return T;
+    }
+  };
+
+  struct FragList {
+    MCFragment *Head = nullptr;
+    MCFragment *Tail = nullptr;
+  };
 
 private:
+  // At parse time, this holds the fragment list of the current subsection. At
+  // layout time, this holds the concatenated fragment lists of all
+  // subsections.
+  FragList *CurFragList;
   MCSymbol *Begin;
   MCSymbol *End = nullptr;
   /// The alignment requirement of this section.
@@ -92,11 +106,10 @@ class MCSection {
 
   MCDummyFragment DummyFragment;
 
-  FragmentListType Fragments;
-
-  /// Mapping from subsection number to insertion point for subsection numbers
-  /// below that number.
-  SmallVector<std::pair<unsigned, MCFragment *>, 1> SubsectionFragmentMap;
+  // Mapping from subsection number to fragment list. At layout time, the
+  // subsection 0 list is replaced with concatenated fragments from all
+  // subsections.
+  SmallVector<std::pair<unsigned, FragList>, 1> Subsections;
 
   /// State for tracking labels that don't yet have Fragments
   struct PendingLabel {
@@ -171,29 +184,27 @@ class MCSection {
   bool isRegistered() const { return IsRegistered; }
   void setIsRegistered(bool Value) { IsRegistered = Value; }
 
-  MCSection::FragmentListType &getFragmentList() { return Fragments; }
-  const MCSection::FragmentListType &getFragmentList() const {
-    return const_cast<MCSection *>(this)->getFragmentList();
-  }
-
-  /// Support for MCFragment::getNextNode().
-  static FragmentListType MCSection::*getSublistAccess(MCFragment *) {
-    return &MCSection::Fragments;
-  }
-
   const MCDummyFragment &getDummyFragment() const { return DummyFragment; }
   MCDummyFragment &getDummyFragment() { return DummyFragment; }
 
-  iterator begin() { return Fragments.begin(); }
-  const_iterator begin() const { return Fragments.begin(); }
-
-  iterator end() { return Fragments.end(); }
-  const_iterator end() const { return Fragments.end(); }
-  bool empty() const { return Fragments.empty(); }
-
-  void addFragment(MCFragment &F) { Fragments.push_back(&F); }
+  FragList *curFragList() const { return CurFragList; }
+  iterator begin() const { return iterator(CurFragList->Head); }
+  iterator end() const { return {}; }
+  bool empty() const { return !CurFragList->Head; }
+
+  void addFragment(MCFragment &F) {
+    // The formal layout order will be finalized in MCAssembler::layout.
+    if (CurFragList->Tail) {
+      CurFragList->Tail->Next = &F;
+      F.setLayoutOrder(CurFragList->Tail->getLayoutOrder() + 1);
+    } else {
+      CurFragList->Head = &F;
+      assert(F.getLayoutOrder() == 0);
+    }
+    CurFragList->Tail = &F;
+  }
 
-  MCSection::iterator getSubsectionInsertionPoint(unsigned Subsection);
+  void switchSubsection(unsigned Subsection);
 
   void dump() const;
 
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index d6831eeaa794b7..dae2caf0181e46 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -797,7 +797,7 @@ struct InstrProfValueSiteRecord {
   /// Value profiling data pairs at a given value site.
   std::list<InstrProfValueData> ValueData;
 
-  InstrProfValueSiteRecord() { ValueData.clear(); }
+  InstrProfValueSiteRecord() = default;
   template <class InputIterator>
   InstrProfValueSiteRecord(InputIterator F, InputIterator L)
       : ValueData(F, L) {}
diff --git a/llvm/lib/Analysis/CGSCCPassManager.cpp b/llvm/lib/Analysis/CGSCCPassManager.cpp
index 8ae5c3dee6103e..2ed1d98f800688 100644
--- a/llvm/lib/Analysis/CGSCCPassManager.cpp
+++ b/llvm/lib/Analysis/CGSCCPassManager.cpp
@@ -158,10 +158,12 @@ ModuleToPostOrderCGSCCPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) {
   SmallDenseSet<std::pair<LazyCallGraph::Node *, LazyCallGraph::Node *>, 4>
       InlinedInternalEdges;
 
+  SmallVector<Function *, 4> DeadFunctions;
+
   CGSCCUpdateResult UR = {
-      RCWorklist, CWorklist, InvalidRefSCCSet,
-      InvalidSCCSet, nullptr, PreservedAnalyses::all(),
-      InlinedInternalEdges, {}};
+      RCWorklist, CWorklist, InvalidRefSCCSet,
+      InvalidSCCSet, nullptr, PreservedAnalyses::all(),
+      InlinedInternalEdges, DeadFunctions, {}};
 
   // Request PassInstrumentation from analysis manager, will use it to run
   // instrumenting callbacks for the passes later.
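The DeadFunctions vector added here threads through CGSCCUpdateResult so that passes queue deletions instead of mutating the graph mid-walk. A minimal C++ sketch of the pass-side protocol, assuming a function that already has zero live uses and whose body has been reduced to a single unreachable; the helper name retireDeadFunction is hypothetical and not part of this patch:

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Queue F for batch removal: markDeadFunction() demotes its remaining call
// edges to ref edges now; the adaptor later calls CG.removeDeadFunctions()
// on the whole batch and erases each Function from the module (see the next
// hunk).
static void retireDeadFunction(LazyCallGraph &CG, CGSCCUpdateResult &UR,
                               Function &F) {
  CG.markDeadFunction(F);
  UR.DeadFunctions.push_back(&F);
}

This mirrors what the Inliner changes later in this patch do after calling makeFunctionBodyUnreachable().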
@@ -340,6 +342,10 @@ ModuleToPostOrderCGSCCPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) {
     } while (!RCWorklist.empty());
   }
 
+  CG.removeDeadFunctions(DeadFunctions);
+  for (Function *DeadF : DeadFunctions)
+    DeadF->eraseFromParent();
+
 #if defined(EXPENSIVE_CHECKS)
   // Verify that the call graph is still valid.
   CG.verify();
@@ -1030,36 +1036,6 @@ static LazyCallGraph::SCC &updateCGAndAnalysisManagerForPass(
     return true;
   });
 
-  // Now do a batch removal of the internal ref edges left.
-  auto NewRefSCCs = RC->removeInternalRefEdge(N, DeadTargets);
-  if (!NewRefSCCs.empty()) {
-    // The old RefSCC is dead, mark it as such.
-    UR.InvalidatedRefSCCs.insert(RC);
-
-    // Note that we don't bother to invalidate analyses as ref-edge
-    // connectivity is not really observable in any way and is intended
-    // exclusively to be used for ordering of transforms rather than for
-    // analysis conclusions.
-
-    // Update RC to the "bottom".
-    assert(G.lookupSCC(N) == C && "Changed the SCC when splitting RefSCCs!");
-    RC = &C->getOuterRefSCC();
-    assert(G.lookupRefSCC(N) == RC && "Failed to update current RefSCC!");
-
-    // The RC worklist is in reverse postorder, so we enqueue the new ones in
-    // RPO except for the one which contains the source node as that is the
-    // "bottom" we will continue processing in the bottom-up walk.
-    assert(NewRefSCCs.front() == RC &&
-           "New current RefSCC not first in the returned list!");
-    for (RefSCC *NewRC : llvm::reverse(llvm::drop_begin(NewRefSCCs))) {
-      assert(NewRC != RC && "Should not encounter the current RefSCC further "
-                            "in the postorder list of new RefSCCs.");
-      UR.RCWorklist.insert(NewRC);
-      LLVM_DEBUG(dbgs() << "Enqueuing a new RefSCC in the update worklist: "
-                        << *NewRC << "\n");
-    }
-  }
-
   // Next demote all the call edges that are now ref edges. This helps make
   // the SCCs small which should minimize the work below as we don't want to
   // form cycles that this would break.
diff --git a/llvm/lib/Analysis/LazyCallGraph.cpp b/llvm/lib/Analysis/LazyCallGraph.cpp
index 48a7ca0061600b..e6bf8c9cbb289f 100644
--- a/llvm/lib/Analysis/LazyCallGraph.cpp
+++ b/llvm/lib/Analysis/LazyCallGraph.cpp
@@ -1160,8 +1160,8 @@ void LazyCallGraph::RefSCC::removeOutgoingEdge(Node &SourceN, Node &TargetN) {
 }
 
 SmallVector<LazyCallGraph::RefSCC *, 1>
-LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN,
-                                             ArrayRef<Node *> TargetNs) {
+LazyCallGraph::RefSCC::removeInternalRefEdges(
+    ArrayRef<std::pair<Node *, Node *>> Edges) {
   // We return a list of the resulting *new* RefSCCs in post-order.
   SmallVector<RefSCC *, 1> Result;
 
@@ -1179,25 +1179,21 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN,
 #endif
 
   // First remove the actual edges.
-  for (Node *TargetN : TargetNs) {
-    assert(!(*SourceN)[*TargetN].isCall() &&
+  for (auto [SourceN, TargetN] : Edges) {
+    assert(!(**SourceN)[*TargetN].isCall() &&
            "Cannot remove a call edge, it must first be made a ref edge");
 
-    bool Removed = SourceN->removeEdgeInternal(*TargetN);
+    bool Removed = (*SourceN)->removeEdgeInternal(*TargetN);
     (void)Removed;
     assert(Removed && "Target not in the edge set for this caller?");
   }
 
   // Direct self references don't impact the ref graph at all.
-  if (llvm::all_of(TargetNs,
-                   [&](Node *TargetN) { return &SourceN == TargetN; }))
-    return Result;
-
   // If all targets are in the same SCC as the source, because no call edges
   // were removed there is no RefSCC structure change.
-  SCC &SourceC = *G->lookupSCC(SourceN);
-  if (llvm::all_of(TargetNs, [&](Node *TargetN) {
-        return G->lookupSCC(*TargetN) == &SourceC;
+  if (llvm::all_of(Edges, [&](std::pair<Node *, Node *> E) {
+        return E.first == E.second ||
+               G->lookupSCC(*E.first) == G->lookupSCC(*E.second);
       }))
     return Result;
 
@@ -1499,7 +1495,7 @@ void LazyCallGraph::removeEdge(Node &SourceN, Node &TargetN) {
   assert(Removed && "Target not in the edge set for this caller?");
 }
 
-void LazyCallGraph::removeDeadFunction(Function &F) {
+void LazyCallGraph::markDeadFunction(Function &F) {
   // FIXME: This is unnecessarily restrictive. We should be able to remove
   // functions which recursively call themselves.
   assert(F.hasZeroLiveUses() &&
@@ -1515,57 +1511,66 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
 
   Node &N = *NI->second;
 
-  // Cannot remove a function which has yet to be visited in the DFS walk, so
-  // if we have a node at all then we must have an SCC and RefSCC.
-  auto CI = SCCMap.find(&N);
-  assert(CI != SCCMap.end() &&
-         "Tried to remove a node without an SCC after DFS walk started!");
-  SCC &C = *CI->second;
-  RefSCC *RC = &C.getOuterRefSCC();
-
-  // In extremely rare cases, we can delete a dead function which is still in a
-  // non-trivial RefSCC. This can happen due to spurious ref edges sticking
-  // around after an IR function reference is removed.
-  if (RC->size() != 1) {
-    SmallVector<Node *> NodesInRC;
-    for (SCC &OtherC : *RC) {
-      for (Node &OtherN : OtherC)
-        NodesInRC.push_back(&OtherN);
+  // Remove all call edges out of the dead function.
+  for (Edge E : *N) {
+    if (E.isCall())
+      N->setEdgeKind(E.getNode(), Edge::Ref);
+  }
+}
+
+void LazyCallGraph::removeDeadFunctions(ArrayRef<Function *> DeadFs) {
+  if (DeadFs.empty())
+    return;
+
+  // Group dead functions by the RefSCC they're in.
+  DenseMap<RefSCC *, SmallVector<Node *>> RCs;
+  for (Function *DeadF : DeadFs) {
+    Node *N = lookup(*DeadF);
+#ifndef NDEBUG
+    for (Edge &E : **N) {
+      assert(!E.isCall() &&
+             "dead function shouldn't have any outgoing call edges");
     }
-    for (Node *OtherN : NodesInRC) {
-      if ((*OtherN)->lookup(N)) {
-        auto NewRefSCCs =
-            RC->removeInternalRefEdge(*OtherN, ArrayRef(&N));
-        // If we've split into multiple RefSCCs, RC is now invalid and the
-        // RefSCC containing C will be different.
-        if (!NewRefSCCs.empty())
-          RC = &C.getOuterRefSCC();
+#endif
+    RefSCC *RC = lookupRefSCC(*N);
+    RCs[RC].push_back(N);
+  }
+  // Remove outgoing edges from all dead functions. Dead functions should
+  // already have had their call edges removed in markDeadFunction(), so we
+  // only need to worry about spurious ref edges.
+  for (auto [RC, DeadNs] : RCs) {
+    SmallVector<std::pair<Node *, Node *>> InternalEdgesToRemove;
+    for (Node *DeadN : DeadNs) {
+      for (Edge &E : **DeadN) {
+        if (lookupRefSCC(E.getNode()) == RC)
+          InternalEdgesToRemove.push_back({DeadN, &E.getNode()});
+        else
+          RC->removeOutgoingEdge(*DeadN, E.getNode());
       }
     }
+    // We ignore the returned RefSCCs since at this point we're done with CGSCC
+    // iteration and don't need to add them to any worklists.
+    (void)RC->removeInternalRefEdges(InternalEdgesToRemove);
+    for (Node *DeadN : DeadNs) {
+      RefSCC *DeadRC = lookupRefSCC(*DeadN);
+      assert(DeadRC->size() == 1);
+      assert(DeadRC->begin()->size() == 1);
+      DeadRC->clear();
+      DeadRC->G = nullptr;
+    }
   }
+  // Clean up data structures.
+ for (Function *DeadF : DeadFs) { + Node &N = *lookup(*DeadF); + + EntryEdges.removeEdgeInternal(N); + SCCMap.erase(SCCMap.find(&N)); + NodeMap.erase(NodeMap.find(DeadF)); - NodeMap.erase(NI); - EntryEdges.removeEdgeInternal(N); - SCCMap.erase(CI); - - // This node must be the only member of its SCC as it has no callers, and - // that SCC must be the only member of a RefSCC as it has no references. - // Validate these properties first. - assert(C.size() == 1 && "Dead functions must be in a singular SCC"); - assert(RC->size() == 1 && "Dead functions must be in a singular RefSCC"); - - // Finally clear out all the data structures from the node down through the - // components. postorder_ref_scc_iterator will skip empty RefSCCs, so no need - // to adjust LazyCallGraph data structures. - N.clear(); - N.G = nullptr; - N.F = nullptr; - C.clear(); - RC->clear(); - RC->G = nullptr; - - // Nothing to delete as all the objects are allocated in stable bump pointer - // allocators. + N.clear(); + N.G = nullptr; + N.F = nullptr; + } } // Gets the Edge::Kind from one function to another by looking at the function's diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 93b8d28ef749f3..fd8919fff6ff96 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2389,7 +2389,7 @@ bool LoopAccessInfo::canAnalyzeLoop() { return true; } -void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, +bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, const TargetLibraryInfo *TLI, DominatorTree *DT) { // Holds the Load and Store instructions. @@ -2430,10 +2430,8 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, // With both a non-vectorizable memory instruction and a convergent // operation, found in this loop, no reason to continue the search. - if (HasComplexMemInst && HasConvergentOp) { - CanVecMem = false; - return; - } + if (HasComplexMemInst && HasConvergentOp) + return false; // Avoid hitting recordAnalysis multiple times. if (HasComplexMemInst) @@ -2508,10 +2506,8 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, } // Next instr. } // Next block. - if (HasComplexMemInst) { - CanVecMem = false; - return; - } + if (HasComplexMemInst) + return false; // Now we have two lists that hold the loads and the stores. // Next, we find the pointers that they use. @@ -2520,8 +2516,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, // care if the pointers are *restrict*. if (!Stores.size()) { LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n"); - CanVecMem = true; - return; + return true; } MemoryDepChecker::DepCandidates DependentAccesses; @@ -2574,8 +2569,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, LLVM_DEBUG( dbgs() << "LAA: A loop annotated parallel, ignore memory dependency " << "checks.\n"); - CanVecMem = true; - return; + return true; } for (LoadInst *LD : Loads) { @@ -2622,8 +2616,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, // other reads in this loop then is it safe to vectorize. 
if (NumReadWrites == 1 && NumReads == 0) { LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n"); - CanVecMem = true; - return; + return true; } // Build dependence sets and check whether we need a runtime pointer bounds @@ -2642,21 +2635,20 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, << "cannot identify array bounds"; LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " << "the array bounds.\n"); - CanVecMem = false; - return; + return false; } LLVM_DEBUG( dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n"); - CanVecMem = true; + bool DepsAreSafe = true; if (Accesses.isDependencyCheckNeeded()) { LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n"); - CanVecMem = DepChecker->areDepsSafe(DependentAccesses, - Accesses.getDependenciesToCheck(), - Accesses.getUnderlyingObjects()); + DepsAreSafe = DepChecker->areDepsSafe(DependentAccesses, + Accesses.getDependenciesToCheck(), + Accesses.getUnderlyingObjects()); - if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) { + if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) { LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n"); // Clear the dependency checks. We assume they are not needed. @@ -2676,30 +2668,30 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, recordAnalysis("CantCheckMemDepsAtRunTime", I) << "cannot check memory dependencies at runtime"; LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n"); - CanVecMem = false; - return; + return false; } - - CanVecMem = true; + DepsAreSafe = true; } } if (HasConvergentOp) { recordAnalysis("CantInsertRuntimeCheckWithConvergent") - << "cannot add control dependency to convergent operation"; + << "cannot add control dependency to convergent operation"; LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check " "would be needed with a convergent operation\n"); - CanVecMem = false; - return; + return false; } - if (CanVecMem) + if (DepsAreSafe) { LLVM_DEBUG( dbgs() << "LAA: No unsafe dependent memory operations in loop. We" << (PtrRtChecking->Need ? "" : " don't") << " need runtime memory checks.\n"); - else - emitUnsafeDependenceRemark(); + return true; + } + + emitUnsafeDependenceRemark(); + return false; } void LoopAccessInfo::emitUnsafeDependenceRemark() { @@ -3048,7 +3040,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, MaxTargetVectorWidthInBits); PtrRtChecking = std::make_unique(*DepChecker, SE); if (canAnalyzeLoop()) - analyzeLoop(AA, LI, TLI, DT); + CanVecMem = analyzeLoop(AA, LI, TLI, DT); } void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 8490853eda87c2..4ff606d3732388 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -831,6 +831,19 @@ void MCAssembler::layout(MCAsmLayout &Layout) { MCSection *Sec = Layout.getSectionOrder()[i]; Sec->setLayoutOrder(i); + // Chain together fragments from all subsections. 
+  MCDummyFragment Dummy(Sec);
+  MCFragment *Tail = &Dummy;
+  for (auto &[_, List] : Sec->Subsections) {
+    if (!List.Head)
+      continue;
+    Tail->Next = List.Head;
+    Tail = List.Tail;
+  }
+  Sec->Subsections.clear();
+  Sec->Subsections.push_back({0u, {Dummy.getNext(), Tail}});
+  Sec->CurFragList = &Sec->Subsections[0].second;
+
   unsigned FragmentIndex = 0;
   for (MCFragment &Frag : *Sec)
     Frag.setLayoutOrder(FragmentIndex++);
@@ -1094,7 +1107,7 @@ bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout,
   uint64_t AlignedOffset = Layout.getFragmentOffset(&BF);
   uint64_t AlignedSize = 0;
-  for (const MCFragment *F = BF.getNextNode();; F = F->getNextNode()) {
+  for (const MCFragment *F = BF.getNext();; F = F->getNext()) {
     AlignedSize += computeFragmentSize(Layout, *F);
     if (F == BF.getLastFragment())
       break;
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index b70ac86c18ccf1..2eecdb82d30bb0 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -661,25 +661,16 @@ static void AttemptToFoldSymbolOffsetDifference(
   // this is important when the Subtarget is changed and a new MCDataFragment
   // is created in the case of foo: instr; .arch_extension ext; instr .if . -
   // foo.
-  if (SA.isVariable() || SB.isVariable() ||
-      FA->getSubsectionNumber() != FB->getSubsectionNumber())
+  if (SA.isVariable() || SB.isVariable())
     return;
 
   // Try to find a constant displacement from FA to FB, add the displacement
   // between the offset in FA of SA and the offset in FB of SB.
   bool Reverse = false;
-  if (FA == FB) {
+  if (FA == FB)
     Reverse = SA.getOffset() < SB.getOffset();
-  } else if (!isa<MCDummyFragment>(FA)) {
-    // Testing FA < FB is slow. Use setLayoutOrder to speed up computation.
-    // The formal layout order will be finalized in MCAssembler::layout.
-    if (FA->getLayoutOrder() == 0 || FB->getLayoutOrder() == 0) {
-      unsigned LayoutOrder = 0;
-      for (MCFragment &F : *FA->getParent())
-        F.setLayoutOrder(++LayoutOrder);
-    }
+  else if (!isa<MCDummyFragment>(FA))
     Reverse = FA->getLayoutOrder() < FB->getLayoutOrder();
-  }
 
   uint64_t SAOffset = SA.getOffset(), SBOffset = SB.getOffset();
   int64_t Displacement = SA.getOffset() - SB.getOffset();
@@ -695,7 +686,7 @@ static void AttemptToFoldSymbolOffsetDifference(
   // instruction, the difference cannot be resolved as it may be changed by
   // the linker.
   bool BBeforeRelax = false, AAfterRelax = false;
-  for (auto FI = FB->getIterator(), FE = SecA.end(); FI != FE; ++FI) {
+  for (auto FI = FB; FI; FI = FI->getNext()) {
     auto DF = dyn_cast<MCDataFragment>(FI);
     if (DF && DF->isLinkerRelaxable()) {
       if (&*FI != FB || SBOffset != DF->getContents().size())
@@ -726,12 +717,14 @@ static void AttemptToFoldSymbolOffsetDifference(
         return;
       }
     }
-    // If the previous loop does not find FA, FA must be a dummy fragment not in
-    // the fragment list (which means SA is a pending label (see
-    // flushPendingLabels)). In either case, we can resolve the difference.
-    assert(Found || isa<MCDummyFragment>(FA));
-    Addend += Reverse ? -Displacement : Displacement;
-    FinalizeFolding();
+    // If FA and FB belong to the same subsection, the previous loop will find
+    // FA; otherwise FA is a dummy fragment not in the fragment list (which
+    // means SA is a pending label, see flushPendingLabels). In either case we
+    // can resolve the difference; if FA and FB are in different subsections,
+    // we cannot.
+    if (Found || isa<MCDummyFragment>(FA)) {
+      Addend += Reverse ? -Displacement : Displacement;
+      FinalizeFolding();
+    }
   }
 }
 
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index 84a587164c788d..6d97e8ce552baf 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -141,7 +141,7 @@ const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const {
 
 uint64_t MCAsmLayout::getSectionAddressSize(const MCSection *Sec) const {
   // The size is the last fragment's end offset.
-  const MCFragment &F = Sec->getFragmentList().back();
+  const MCFragment &F = *Sec->curFragList()->Tail;
   return getFragmentOffset(&F) + getAssembler().computeFragmentSize(*this, F);
 }
 
@@ -197,8 +197,6 @@ uint64_t llvm::computeBundlePadding(const MCAssembler &Assembler,
 
 /* *** */
 
-void ilist_alloc_traits<MCFragment>::deleteNode(MCFragment *V) { V->destroy(); }
-
 MCFragment::MCFragment(FragmentType Kind, bool HasInstructions,
                        MCSection *Parent)
     : Parent(Parent), Atom(nullptr), Offset(~UINT64_C(0)), LayoutOrder(0),
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index ae4e6915fa294c..bf1ce76cdc14bd 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -180,7 +180,6 @@ void MCObjectStreamer::reset() {
     if (getContext().getTargetOptions())
       Assembler->setRelaxAll(getContext().getTargetOptions()->MCRelaxAll);
   }
-  CurInsertionPoint = MCSection::iterator();
  EmitEHFrame = true;
   EmitDebugFrame = false;
   PendingLabels.clear();
@@ -200,12 +199,7 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) {
 }
 
 MCFragment *MCObjectStreamer::getCurrentFragment() const {
-  assert(getCurrentSectionOnly() && "No current section!");
-
-  if (CurInsertionPoint != getCurrentSectionOnly()->begin())
-    return &*std::prev(CurInsertionPoint);
-
-  return nullptr;
+  return getCurrentSectionOnly()->curFragList()->Tail;
 }
 
 static bool canReuseDataFragment(const MCDataFragment &F,
@@ -391,8 +385,7 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section,
   }
   CurSubsectionIdx = unsigned(IntSubsection);
-  CurInsertionPoint =
-      Section->getSubsectionInsertionPoint(CurSubsectionIdx);
+  Section->switchSubsection(CurSubsectionIdx);
   return Created;
 }
 
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 9848d7fafe764a..1d9fe2cafd6174 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -24,7 +24,10 @@ MCSection::MCSection(SectionVariant V, StringRef Name, SectionKind K,
                      MCSymbol *Begin)
     : Begin(Begin), BundleGroupBeforeFirstInst(false), HasInstructions(false),
       HasLayout(false), IsRegistered(false), DummyFragment(this), Name(Name),
-      Variant(V), Kind(K) {}
+      Variant(V), Kind(K) {
+  // The initial subsection number is 0. Create a fragment list.
+  CurFragList = &Subsections.emplace_back(0u, FragList{}).second;
+}
 
 MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) {
   if (!End)
@@ -34,7 +37,14 @@ MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) {
 
 bool MCSection::hasEnded() const { return End && End->isInSection(); }
 
-MCSection::~MCSection() = default;
+MCSection::~MCSection() {
+  for (auto &[_, Chain] : Subsections) {
+    for (MCFragment *X = Chain.Head, *Y; X; X = Y) {
+      Y = X->Next;
+      X->destroy();
+    }
+  }
+}
 
 void MCSection::setBundleLockState(BundleLockStateType NewState) {
   if (NewState == NotBundleLocked) {
@@ -55,35 +65,15 @@ void MCSection::setBundleLockState(BundleLockStateType NewState) {
   ++BundleLockNestingDepth;
 }
 
-MCSection::iterator
-MCSection::getSubsectionInsertionPoint(unsigned Subsection) {
-  if (Subsection == 0 && SubsectionFragmentMap.empty())
-    return end();
-
-  SmallVectorImpl<std::pair<unsigned, MCFragment *>>::iterator MI = lower_bound(
-      SubsectionFragmentMap, std::make_pair(Subsection, (MCFragment *)nullptr));
-  bool ExactMatch = false;
-  if (MI != SubsectionFragmentMap.end()) {
-    ExactMatch = MI->first == Subsection;
-    if (ExactMatch)
-      ++MI;
-  }
-  iterator IP;
-  if (MI == SubsectionFragmentMap.end())
-    IP = end();
-  else
-    IP = MI->second->getIterator();
-  if (!ExactMatch && Subsection != 0) {
-    // The GNU as documentation claims that subsections have an alignment of 4,
-    // although this appears not to be the case.
-    MCFragment *F = new MCDataFragment();
-    SubsectionFragmentMap.insert(MI, std::make_pair(Subsection, F));
-    getFragmentList().insert(IP, F);
-    F->setParent(this);
-    F->setSubsectionNumber(Subsection);
-  }
-
-  return IP;
+void MCSection::switchSubsection(unsigned Subsection) {
+  size_t I = 0, E = Subsections.size();
+  while (I != E && Subsections[I].first < Subsection)
+    ++I;
+  // If the subsection number is not in the sorted Subsections list, create a
+  // new fragment list.
+  if (I == E || Subsections[I].first != Subsection)
+    Subsections.insert(Subsections.begin() + I, {Subsection, FragList{}});
+  CurFragList = &Subsections[I].second;
 }
 
 StringRef MCSection::getVirtualSectionKind() const { return "virtual"; }
@@ -111,13 +101,11 @@ void MCSection::flushPendingLabels() {
   // creating new empty data fragments for each Subsection with labels pending.
   while (!PendingLabels.empty()) {
     PendingLabel& Label = PendingLabels[0];
-    iterator CurInsertionPoint =
-        this->getSubsectionInsertionPoint(Label.Subsection);
-    const MCSymbol *Atom = nullptr;
-    if (CurInsertionPoint != begin())
-      Atom = std::prev(CurInsertionPoint)->getAtom();
+    switchSubsection(Label.Subsection);
+    const MCSymbol *Atom =
+        CurFragList->Tail ? CurFragList->Tail->getAtom() : nullptr;
     MCFragment *F = new MCDataFragment();
-    getFragmentList().insert(CurInsertionPoint, F);
+    addFragment(*F);
     F->setParent(this);
     F->setAtom(Atom);
     flushPendingLabels(F, 0, Label.Subsection);
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index 451269608f1799..522e268156aa3c 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -1864,15 +1864,14 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm,
       if (EmptyFrag.getKind() != MCFragment::FT_Data)
         report_fatal_error(".init_array section should be aligned");
 
-      IT = std::next(IT);
-      const MCFragment &AlignFrag = *IT;
+      const MCFragment &AlignFrag = *EmptyFrag.getNext();
       if (AlignFrag.getKind() != MCFragment::FT_Align)
         report_fatal_error(".init_array section should be aligned");
 
       if (cast<MCAlignFragment>(AlignFrag).getAlignment() !=
          Align(is64Bit() ? 8 : 4))
         report_fatal_error(".init_array section should be aligned for pointers");
 
-      const MCFragment &Frag = *std::next(IT);
+      const MCFragment &Frag = *AlignFrag.getNext();
       if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data)
         report_fatal_error("only data supported in .init_array section");
 
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 3b6ea81cdf10ed..54efe4bc25efe7 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -712,17 +712,20 @@ class HexagonAsmBackend : public MCAsmBackend {
 
   void finishLayout(MCAssembler const &Asm,
                     MCAsmLayout &Layout) const override {
+    SmallVector<MCFragment *> Frags;
     for (auto *I : Layout.getSectionOrder()) {
-      for (auto &J : *I) {
-        switch (J.getKind()) {
+      Frags.clear();
+      for (MCFragment &F : *I)
+        Frags.push_back(&F);
+      for (size_t J = 0, E = Frags.size(); J != E; ++J) {
+        switch (Frags[J]->getKind()) {
         default:
           break;
         case MCFragment::FT_Align: {
-          auto Size = Asm.computeFragmentSize(Layout, J);
-          for (auto K = J.getIterator();
-               K != I->begin() && Size >= HEXAGON_PACKET_SIZE;) {
+          auto Size = Asm.computeFragmentSize(Layout, *Frags[J]);
+          for (auto K = J; K != 0 && Size >= HEXAGON_PACKET_SIZE;) {
             --K;
-            switch (K->getKind()) {
+            switch (Frags[K]->getKind()) {
             default:
               break;
             case MCFragment::FT_Align: {
@@ -732,7 +735,7 @@ class HexagonAsmBackend : public MCAsmBackend {
             }
             case MCFragment::FT_Relaxable: {
               MCContext &Context = Asm.getContext();
-              auto &RF = cast<MCRelaxableFragment>(*K);
+              auto &RF = cast<MCRelaxableFragment>(*Frags[K]);
               auto &Inst = const_cast<MCInst &>(RF.getInst());
               while (Size > 0 &&
                      HexagonMCInstrInfo::bundleSize(Inst) < MaxPacketSize) {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index b8e0f3a867f402..d83dadd3016193 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -62,7 +62,7 @@ const MCFixup *RISCVMCExpr::getPCRelHiFixup(const MCFragment **DFOut) const {
 
   uint64_t Offset = AUIPCSymbol->getOffset();
   if (DF->getContents().size() == Offset) {
-    DF = dyn_cast_or_null<MCDataFragment>(DF->getNextNode());
+    DF = dyn_cast_or_null<MCDataFragment>(DF->getNext());
     if (!DF)
       return nullptr;
     Offset = 0;
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 49838e685a6d2b..6bb3e215240a87 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -300,6 +300,72 @@ lookupBuiltin(StringRef DemangledCall,
   return nullptr;
 }
 
+static MachineInstr *getBlockStructInstr(Register ParamReg,
+                                         MachineRegisterInfo *MRI) {
+  // We expect the following sequence of instructions:
+  //   %0:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.alloca)
+  //   or       = G_GLOBAL_VALUE @block_literal_global
+  //   %1:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.bitcast), %0
+  //   %2:_(p4) = G_ADDRSPACE_CAST %1:_(pN)
+  MachineInstr *MI = MRI->getUniqueVRegDef(ParamReg);
+  assert(MI->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST &&
+         MI->getOperand(1).isReg());
+  Register BitcastReg = MI->getOperand(1).getReg();
+  MachineInstr *BitcastMI = MRI->getUniqueVRegDef(BitcastReg);
+  assert(isSpvIntrinsic(*BitcastMI, Intrinsic::spv_bitcast) &&
+         BitcastMI->getOperand(2).isReg());
+  Register ValueReg = BitcastMI->getOperand(2).getReg();
+  MachineInstr *ValueMI = MRI->getUniqueVRegDef(ValueReg);
+  return ValueMI;
+}
+
+// Return an integer constant corresponding to the given register and
+// defined in spv_track_constant.
+// TODO: maybe unify with prelegalizer pass.
+static unsigned getConstFromIntrinsic(Register Reg, MachineRegisterInfo *MRI) {
+  MachineInstr *DefMI = MRI->getUniqueVRegDef(Reg);
+  assert(isSpvIntrinsic(*DefMI, Intrinsic::spv_track_constant) &&
+         DefMI->getOperand(2).isReg());
+  MachineInstr *DefMI2 = MRI->getUniqueVRegDef(DefMI->getOperand(2).getReg());
+  assert(DefMI2->getOpcode() == TargetOpcode::G_CONSTANT &&
+         DefMI2->getOperand(1).isCImm());
+  return DefMI2->getOperand(1).getCImm()->getValue().getZExtValue();
+}
+
+// Return type of the instruction result from spv_assign_type intrinsic.
+// TODO: maybe unify with prelegalizer pass.
+static const Type *getMachineInstrType(MachineInstr *MI) {
+  MachineInstr *NextMI = MI->getNextNode();
+  if (!NextMI)
+    return nullptr;
+  if (isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_name))
+    if ((NextMI = NextMI->getNextNode()) == nullptr)
+      return nullptr;
+  Register ValueReg = MI->getOperand(0).getReg();
+  if ((!isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_type) &&
+       !isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_ptr_type)) ||
+      NextMI->getOperand(1).getReg() != ValueReg)
+    return nullptr;
+  Type *Ty = getMDOperandAsType(NextMI->getOperand(2).getMetadata(), 0);
+  assert(Ty && "Type is expected");
+  return Ty;
+}
+
+static const Type *getBlockStructType(Register ParamReg,
+                                      MachineRegisterInfo *MRI) {
+  // In principle, this information should be passed to us from Clang via
+  // an elementtype attribute. However, said attribute requires that
+  // the function call be an intrinsic, which is not. Instead, we rely on being
+  // able to trace this to the declaration of a variable: OpenCL C specification
+  // section 6.12.5 should guarantee that we can do this.
+ MachineInstr *MI = getBlockStructInstr(ParamReg, MRI); + if (MI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) + return MI->getOperand(1).getGlobal()->getType(); + assert(isSpvIntrinsic(*MI, Intrinsic::spv_alloca) && + "Blocks in OpenCL C must be traceable to allocation site"); + return getMachineInstrType(MI); +} + //===----------------------------------------------------------------------===// // Helper functions for building misc instructions //===----------------------------------------------------------------------===// @@ -1371,6 +1437,14 @@ static bool generateBarrierInst(const SPIRV::IncomingCall *Call, return buildBarrierInst(Call, Opcode, MIRBuilder, GR); } +static bool generateCastToPtrInst(const SPIRV::IncomingCall *Call, + MachineIRBuilder &MIRBuilder) { + MIRBuilder.buildInstr(TargetOpcode::G_ADDRSPACE_CAST) + .addDef(Call->ReturnRegister) + .addUse(Call->Arguments[0]); + return true; +} + static bool generateDotOrFMulInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { @@ -1847,68 +1921,6 @@ static bool buildNDRange(const SPIRV::IncomingCall *Call, .addUse(TmpReg); } -static MachineInstr *getBlockStructInstr(Register ParamReg, - MachineRegisterInfo *MRI) { - // We expect the following sequence of instructions: - // %0:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.alloca) - // or = G_GLOBAL_VALUE @block_literal_global - // %1:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.bitcast), %0 - // %2:_(p4) = G_ADDRSPACE_CAST %1:_(pN) - MachineInstr *MI = MRI->getUniqueVRegDef(ParamReg); - assert(MI->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST && - MI->getOperand(1).isReg()); - Register BitcastReg = MI->getOperand(1).getReg(); - MachineInstr *BitcastMI = MRI->getUniqueVRegDef(BitcastReg); - assert(isSpvIntrinsic(*BitcastMI, Intrinsic::spv_bitcast) && - BitcastMI->getOperand(2).isReg()); - Register ValueReg = BitcastMI->getOperand(2).getReg(); - MachineInstr *ValueMI = MRI->getUniqueVRegDef(ValueReg); - return ValueMI; -} - -// Return an integer constant corresponding to the given register and -// defined in spv_track_constant. -// TODO: maybe unify with prelegalizer pass. -static unsigned getConstFromIntrinsic(Register Reg, MachineRegisterInfo *MRI) { - MachineInstr *DefMI = MRI->getUniqueVRegDef(Reg); - assert(isSpvIntrinsic(*DefMI, Intrinsic::spv_track_constant) && - DefMI->getOperand(2).isReg()); - MachineInstr *DefMI2 = MRI->getUniqueVRegDef(DefMI->getOperand(2).getReg()); - assert(DefMI2->getOpcode() == TargetOpcode::G_CONSTANT && - DefMI2->getOperand(1).isCImm()); - return DefMI2->getOperand(1).getCImm()->getValue().getZExtValue(); -} - -// Return type of the instruction result from spv_assign_type intrinsic. -// TODO: maybe unify with prelegalizer pass. -static const Type *getMachineInstrType(MachineInstr *MI) { - MachineInstr *NextMI = MI->getNextNode(); - if (isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_name)) - NextMI = NextMI->getNextNode(); - Register ValueReg = MI->getOperand(0).getReg(); - if (!isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_type) || - NextMI->getOperand(1).getReg() != ValueReg) - return nullptr; - Type *Ty = getMDOperandAsType(NextMI->getOperand(2).getMetadata(), 0); - assert(Ty && "Type is expected"); - return Ty; -} - -static const Type *getBlockStructType(Register ParamReg, - MachineRegisterInfo *MRI) { - // In principle, this information should be passed to us from Clang via - // an elementtype attribute. 
However, said attribute requires that - // the function call be an intrinsic, which is not. Instead, we rely on being - // able to trace this to the declaration of a variable: OpenCL C specification - // section 6.12.5 should guarantee that we can do this. - MachineInstr *MI = getBlockStructInstr(ParamReg, MRI); - if (MI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) - return MI->getOperand(1).getGlobal()->getType(); - assert(isSpvIntrinsic(*MI, Intrinsic::spv_alloca) && - "Blocks in OpenCL C must be traceable to allocation site"); - return getMachineInstrType(MI); -} - // TODO: maybe move to the global register. static SPIRVType * getOrCreateSPIRVDeviceEventPointer(MachineIRBuilder &MIRBuilder, @@ -2322,6 +2334,8 @@ std::optional lowerBuiltin(const StringRef DemangledCall, return generateAtomicFloatingInst(Call.get(), MIRBuilder, GR); case SPIRV::Barrier: return generateBarrierInst(Call.get(), MIRBuilder, GR); + case SPIRV::CastToPtr: + return generateCastToPtrInst(Call.get(), MIRBuilder); case SPIRV::Dot: return generateDotOrFMulInst(Call.get(), MIRBuilder, GR); case SPIRV::Wave: diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index edc9e1a33d9f5a..2edd2992425bd0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -59,6 +59,7 @@ def IntelSubgroups : BuiltinGroup; def AtomicFloating : BuiltinGroup; def GroupUniform : BuiltinGroup; def KernelClock : BuiltinGroup; +def CastToPtr : BuiltinGroup; //===----------------------------------------------------------------------===// // Class defining a demangled builtin record. The information in the record @@ -595,6 +596,17 @@ defm : DemangledNativeBuiltin<"__spirv_GroupWaitEvents", OpenCL_std, AsyncCopy, defm : DemangledNativeBuiltin<"__spirv_Load", OpenCL_std, LoadStore, 1, 3, OpLoad>; defm : DemangledNativeBuiltin<"__spirv_Store", OpenCL_std, LoadStore, 2, 4, OpStore>; +// Address Space Qualifier Functions/Pointers Conversion Instructions: +defm : DemangledNativeBuiltin<"to_global", OpenCL_std, CastToPtr, 1, 1, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"to_local", OpenCL_std, CastToPtr, 1, 1, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"to_private", OpenCL_std, CastToPtr, 1, 1, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtr_ToGlobal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtr_ToLocal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtr_ToPrivate", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_OpGenericCastToPtrExplicit_ToGlobal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_OpGenericCastToPtrExplicit_ToLocal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_OpGenericCastToPtrExplicit_ToPrivate", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; + //===----------------------------------------------------------------------===// // Class defining a work/sub group builtin that should be translated into a // SPIR-V instruction using the defined properties. 
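All nine spellings registered above map to the single CastToPtr group, so the lowering needs no per-name cases. A condensed C++ sketch of that shared path, assembled here only for illustration from the generateCastToPtrInst() and lowerBuiltin() changes earlier in this patch (the function name lowerCastToPtrSketch is hypothetical, and the snippet is written as it would appear file-local inside SPIRVBuiltins.cpp rather than as a standalone unit):

// Both the OpenCL forms (to_global/to_local/to_private) and the
// __spirv_GenericCastToPtr* forms reach this one generator, which emits a
// plain address-space cast from the generic pointer argument; instruction
// selection then produces OpGenericCastToPtr, which finalizeLowering()
// validates the same way as OpPtrCastToGeneric.
static bool lowerCastToPtrSketch(const SPIRV::IncomingCall *Call,
                                 MachineIRBuilder &MIRBuilder) {
  MIRBuilder.buildInstr(TargetOpcode::G_ADDRSPACE_CAST)
      .addDef(Call->ReturnRegister)
      .addUse(Call->Arguments[0]);
  return true;
}

The result storage class (CrossWorkgroup, Workgroup, or Function) falls out of the call's declared return pointer type, as exercised by the OpGenericCastToPtr.ll test added later in this patch.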
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
index 2ec3fb35ca0451..3c8405fadd44e9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
+++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
@@ -16,6 +16,7 @@
 
 #include "MCTargetDesc/SPIRVBaseInfo.h"
 #include "MCTargetDesc/SPIRVMCTargetDesc.h"
+#include "SPIRVUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -285,10 +286,13 @@ class SPIRVGeneralDuplicatesTracker {
     TT.add(Ty, MF, R);
   }
 
-  void add(const Type *PointerElementType, unsigned AddressSpace,
+  void add(const Type *PointeeTy, unsigned AddressSpace,
            const MachineFunction *MF, Register R) {
-    ST.add(SPIRV::PointerTypeDescriptor(PointerElementType, AddressSpace), MF,
-           R);
+    if (isUntypedPointerTy(PointeeTy))
+      PointeeTy =
+          TypedPointerType::get(IntegerType::getInt8Ty(PointeeTy->getContext()),
+                                getPointerAddressSpace(PointeeTy));
+    ST.add(SPIRV::PointerTypeDescriptor(PointeeTy, AddressSpace), MF, R);
   }
 
   void add(const Constant *C, const MachineFunction *MF, Register R) {
@@ -320,10 +324,13 @@ class SPIRVGeneralDuplicatesTracker {
     return TT.find(const_cast<Type *>(Ty), MF);
   }
 
-  Register find(const Type *PointerElementType, unsigned AddressSpace,
+  Register find(const Type *PointeeTy, unsigned AddressSpace,
                 const MachineFunction *MF) {
-    return ST.find(
-        SPIRV::PointerTypeDescriptor(PointerElementType, AddressSpace), MF);
+    if (isUntypedPointerTy(PointeeTy))
+      PointeeTy =
+          TypedPointerType::get(IntegerType::getInt8Ty(PointeeTy->getContext()),
+                                getPointerAddressSpace(PointeeTy));
+    return ST.find(SPIRV::PointerTypeDescriptor(PointeeTy, AddressSpace), MF);
   }
 
   Register find(const Constant *C, const MachineFunction *MF) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 7b8e3230bf5534..5c10e04325d515 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -69,7 +69,7 @@ class SPIRVEmitIntrinsics
   DenseSet<Instruction *> AggrStores;
 
   // deduce element type of untyped pointers
-  Type *deduceElementType(Value *I);
+  Type *deduceElementType(Value *I, bool UnknownElemTypeI8);
   Type *deduceElementTypeHelper(Value *I);
   Type *deduceElementTypeHelper(Value *I, std::unordered_set<Value *> &Visited);
   Type *deduceElementTypeByValueDeep(Type *ValueTy, Value *Operand,
@@ -105,7 +105,8 @@ class SPIRVEmitIntrinsics
   void replaceMemInstrUses(Instruction *Old, Instruction *New, IRBuilder<> &B);
   void processInstrAfterVisit(Instruction *I, IRBuilder<> &B);
-  void insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B);
+  bool insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B,
+                                bool UnknownElemTypeI8);
   void insertAssignTypeIntrs(Instruction *I, IRBuilder<> &B);
   void insertAssignPtrTypeTargetExt(TargetExtType *AssignedType, Value *V,
                                     IRBuilder<> &B);
@@ -367,6 +368,23 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper(
       if (Ty)
         break;
     }
+  } else if (auto *CI = dyn_cast<CallInst>(I)) {
+    static StringMap<unsigned> ResTypeByArg = {
+        {"to_global", 0},
+        {"to_local", 0},
+        {"to_private", 0},
+        {"__spirv_GenericCastToPtr_ToGlobal", 0},
+        {"__spirv_GenericCastToPtr_ToLocal", 0},
+        {"__spirv_GenericCastToPtr_ToPrivate", 0}};
+    // TODO: maybe improve performance by caching demangled names
+    if (Function *CalledF = CI->getCalledFunction()) {
+      std::string DemangledName =
+          getOclOrSpirvBuiltinDemangledName(CalledF->getName());
+      auto AsArgIt = ResTypeByArg.find(DemangledName);
+      if (AsArgIt != ResTypeByArg.end())
+        Ty = deduceElementTypeHelper(CI->getArgOperand(AsArgIt->second),
+                                     Visited);
+    }
   }
 
   // remember the found relationship
@@ -460,10 +478,10 @@ Type *SPIRVEmitIntrinsics::deduceNestedTypeHelper(
   return OrigTy;
 }
 
-Type *SPIRVEmitIntrinsics::deduceElementType(Value *I) {
+Type *SPIRVEmitIntrinsics::deduceElementType(Value *I, bool UnknownElemTypeI8) {
   if (Type *Ty = deduceElementTypeHelper(I))
     return Ty;
-  return IntegerType::getInt8Ty(I->getContext());
+  return UnknownElemTypeI8 ? IntegerType::getInt8Ty(I->getContext()) : nullptr;
 }
 
 // If the Instruction has Pointer operands with unresolved types, this function
@@ -1152,16 +1170,23 @@ void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV,
     B.CreateIntrinsic(Intrinsic::spv_unref_global, GV.getType(), &GV);
 }
 
-void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I,
-                                                   IRBuilder<> &B) {
+// Return true if we can't decide what the pointee type is now and will get
+// back to the question later. Return false if spv_assign_ptr_type is not
+// needed or can be inserted immediately.
+bool SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I,
+                                                   IRBuilder<> &B,
+                                                   bool UnknownElemTypeI8) {
   reportFatalOnTokenType(I);
   if (!isPointerTy(I->getType()) || !requireAssignType(I) ||
       isa<BitCastInst>(I))
-    return;
+    return false;
   setInsertPointAfterDef(B, I);
-  Type *ElemTy = deduceElementType(I);
-  buildAssignPtr(B, ElemTy, I);
+  if (Type *ElemTy = deduceElementType(I, UnknownElemTypeI8)) {
+    buildAssignPtr(B, ElemTy, I);
+    return false;
+  }
+  return true;
 }
 
 void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
@@ -1199,7 +1224,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
       buildAssignPtr(B, PType->getElementType(), Op);
     } else if (isPointerTy(OpTy)) {
       Type *ElemTy = GR->findDeducedElementType(Op);
-      buildAssignPtr(B, ElemTy ? ElemTy : deduceElementType(Op), Op);
+      buildAssignPtr(B, ElemTy ?
ElemTy : deduceElementType(Op, true), Op); } else { CallInst *AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type, {OpTy}, Op, Op, {}, B); @@ -1395,10 +1420,15 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { if (isConvergenceIntrinsic(I)) continue; - insertAssignPtrTypeIntrs(I, B); + bool Postpone = insertAssignPtrTypeIntrs(I, B, false); + // if Postpone is true, we can't decide on pointee type yet insertAssignTypeIntrs(I, B); insertPtrCastOrAssignTypeInstr(I, B); insertSpirvDecorations(I, B); + // if instruction requires a pointee type set, let's check if we know it + // already, and force it to be i8 if not + if (Postpone && !GR->findAssignPtrTypeInstr(I)) + insertAssignPtrTypeIntrs(I, B, true); } for (auto &I : instructions(Func)) diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp index 5ccbaf12ddee2e..4383d1c5c0e25d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp @@ -339,6 +339,7 @@ void SPIRVTargetLowering::finalizeLowering(MachineFunction &MF) const { GR.getSPIRVTypeForVReg(MI.getOperand(1).getReg())); break; case SPIRV::OpPtrCastToGeneric: + case SPIRV::OpGenericCastToPtr: validateAccessChain(STI, MRI, GR, MI); break; case SPIRV::OpInBoundsPtrAccessChain: diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h index 6fc200abf46279..77356b7512a739 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h @@ -68,6 +68,11 @@ class SPIRVTargetLowering : public TargetLowering { // extra instructions required to preserve validity of SPIR-V code imposed by // the standard. void finalizeLowering(MachineFunction &MF) const override; + + MVT getPreferredSwitchConditionType(LLVMContext &Context, + EVT ConditionVT) const override { + return ConditionVT.getSimpleVT(); + } }; } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index adc5b36af6f182..53e0432192ca91 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -271,6 +271,21 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR, return SpirvTy; } +// To support current approach and limitations wrt. bit width here we widen a +// scalar register with a bit width greater than 1 to valid sizes and cap it to +// 64 width. 
+static void widenScalarLLTNextPow2(Register Reg, MachineRegisterInfo &MRI) { + LLT RegType = MRI.getType(Reg); + if (!RegType.isScalar()) + return; + unsigned Sz = RegType.getScalarSizeInBits(); + if (Sz == 1) + return; + unsigned NewSz = std::min(std::max(1u << Log2_32_Ceil(Sz), 8u), 64u); + if (NewSz != Sz) + MRI.setType(Reg, LLT::scalar(NewSz)); +} + static std::pair createNewIdReg(SPIRVType *SpvType, Register SrcReg, MachineRegisterInfo &MRI, const SPIRVGlobalRegistry &GR) { @@ -406,6 +421,11 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineInstr &MI = *MII; unsigned MIOp = MI.getOpcode(); + // validate bit width of scalar registers + for (const auto &MOP : MI.operands()) + if (MOP.isReg()) + widenScalarLLTNextPow2(MOP.getReg(), MRI); + if (isSpvIntrinsic(MI, Intrinsic::spv_assign_ptr_type)) { Register Reg = MI.getOperand(1).getReg(); MIB.setInsertPt(*MI.getParent(), MI.getIterator()); @@ -475,11 +495,6 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI); } else if (MIOp == TargetOpcode::G_GLOBAL_VALUE) { propagateSPIRVType(&MI, GR, MRI, MIB); - } else if (MIOp == TargetOpcode::G_BITREVERSE) { - Register Reg = MI.getOperand(0).getReg(); - LLT RegType = MRI.getType(Reg); - if (RegType.getSizeInBits() < 32) - MRI.setType(Reg, LLT::scalar(32)); } if (MII == Begin) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index d9936557776ba1..20e50c8c9e1ae0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -201,6 +201,9 @@ struct RuntimeLibcallSignatureTable { Table[RTLIB::COS_F32] = f32_func_f32; Table[RTLIB::COS_F64] = f64_func_f64; Table[RTLIB::COS_F128] = i64_i64_func_i64_i64; + Table[RTLIB::TAN_F32] = f32_func_f32; + Table[RTLIB::TAN_F64] = f64_func_f64; + Table[RTLIB::TAN_F128] = i64_i64_func_i64_i64; Table[RTLIB::SINCOS_F32] = func_f32_iPTR_iPTR; Table[RTLIB::SINCOS_F64] = func_f64_iPTR_iPTR; Table[RTLIB::SINCOS_F128] = func_i64_i64_iPTR_iPTR; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index bc2eb6dcd541c7..1b8462f2d258ca 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -530,7 +530,7 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, if (!canPadInst(Inst, OS)) return; - if (PendingBA && PendingBA->getNextNode() == OS.getCurrentFragment()) { + if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) { // Macro fusion actually happens and there is no other fragment inserted // after the previous instruction. // @@ -978,8 +978,8 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm, // The layout is done. Mark every fragment as valid. 
for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) { MCSection &Section = *Layout.getSectionOrder()[i]; - Layout.getFragmentOffset(&*Section.getFragmentList().rbegin()); - Asm.computeFragmentSize(Layout, *Section.getFragmentList().rbegin()); + Layout.getFragmentOffset(&*Section.curFragList()->Tail); + Asm.computeFragmentSize(Layout, *Section.curFragList()->Tail); } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index cad3ea4716db3e..0d79e4eb3f75a0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -23816,6 +23816,20 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, } } + // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for + // overflow. + if (isMinSignedConstant(Op1)) { + EVT VT = Op0.getValueType(); + if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) { + SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32); + X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO; + X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); + SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs, + DAG.getConstant(0, dl, VT), Op0); + return SDValue(Neg.getNode(), 1); + } + } + // Try to use the carry flag from the add in place of an separate CMP for: // (seteq (add X, -1), -1). Similar for setne. if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD && diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index a9747aebf67bbc..1a7b9bc8e3e770 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -197,6 +197,14 @@ InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM, return *IAA->getAdvisor(); } +void makeFunctionBodyUnreachable(Function &F) { + F.dropAllReferences(); + for (BasicBlock &BB : make_early_inc_range(F)) + BB.eraseFromParent(); + BasicBlock *BB = BasicBlock::Create(F.getContext(), "", &F); + new UnreachableInst(F.getContext(), BB); +} + PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR) { @@ -448,11 +456,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, }), Calls.end()); - // Clear the body and queue the function itself for deletion when we - // finish inlining and call graph updates. - // Note that after this point, it is an error to do anything other - // than use the callee's address or delete it. - Callee.dropAllReferences(); + // Clear the body and queue the function itself for call graph + // updating when we finish inlining. + makeFunctionBodyUnreachable(Callee); assert(!is_contained(DeadFunctions, &Callee) && "Cannot put cause a function to become dead twice!"); DeadFunctions.push_back(&Callee); @@ -530,7 +536,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (!DeadFunctionsInComdats.empty()) { filterDeadComdatFunctions(DeadFunctionsInComdats); for (auto *Callee : DeadFunctionsInComdats) - Callee->dropAllReferences(); + makeFunctionBodyUnreachable(*Callee); DeadFunctions.append(DeadFunctionsInComdats); } @@ -542,25 +548,18 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // that is OK as all we do is delete things and add pointers to unordered // sets. for (Function *DeadF : DeadFunctions) { + CG.markDeadFunction(*DeadF); // Get the necessary information out of the call graph and nuke the // function there. Also, clear out any cached analyses. 
auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF)); FAM.clear(*DeadF, DeadF->getName()); AM.clear(DeadC, DeadC.getName()); - auto &DeadRC = DeadC.getOuterRefSCC(); - CG.removeDeadFunction(*DeadF); // Mark the relevant parts of the call graph as invalid so we don't visit // them. UR.InvalidatedSCCs.insert(&DeadC); - UR.InvalidatedRefSCCs.insert(&DeadRC); - - // If the updated SCC was the one containing the deleted function, clear it. - if (&DeadC == UR.UpdatedC) - UR.UpdatedC = nullptr; - // And delete the actual function from the module. - M.getFunctionList().erase(DeadF); + UR.DeadFunctions.push_back(DeadF); ++NumDeleted; } diff --git a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp index d0b9884aa9099b..e9f37d4044cb02 100644 --- a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp +++ b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp @@ -67,16 +67,20 @@ bool CallGraphUpdater::finalize() { FAM.clear(*DeadFn, DeadFn->getName()); AM->clear(*DeadSCC, DeadSCC->getName()); - LCG->removeDeadFunction(*DeadFn); + LCG->markDeadFunction(*DeadFn); // Mark the relevant parts of the call graph as invalid so we don't // visit them. UR->InvalidatedSCCs.insert(DeadSCC); UR->InvalidatedRefSCCs.insert(&DeadRC); + UR->DeadFunctions.push_back(DeadFn); + } else { + // The CGSCC infrastructure batch deletes functions at the end of the + // call graph walk, so only erase the function if we're not using that + // infrastructure. + // The function is now really dead and de-attached from everything. + DeadFn->eraseFromParent(); } - - // The function is now really dead and de-attached from everything. - DeadFn->eraseFromParent(); } } diff --git a/llvm/test/CodeGen/SPIRV/optimizations/switch-condition-type.ll b/llvm/test/CodeGen/SPIRV/optimizations/switch-condition-type.ll new file mode 100644 index 00000000000000..054520d2021b99 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/optimizations/switch-condition-type.ll @@ -0,0 +1,18 @@ +; RUN: llc -O2 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O2 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: %[[#INT16:]] = OpTypeInt 16 0 +; CHECK: %[[#PARAM:]] = OpFunctionParameter %[[#INT16]] +; CHECK: OpSwitch %[[#PARAM]] %[[#]] 1 %[[#]] 2 %[[#]] + +define i32 @test_switch(i16 %cond) { +entry: + switch i16 %cond, label %default [ i16 1, label %case_one + i16 2, label %case_two ] +case_one: + ret i32 1 +case_two: + ret i32 2 +default: + ret i32 3 +} diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpGenericCastToPtr.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpGenericCastToPtr.ll new file mode 100644 index 00000000000000..e3a82b3577701b --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpGenericCastToPtr.ll @@ -0,0 +1,138 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-DAG: %[[#Char:]] = OpTypeInt 8 0 +; CHECK-SPIRV-DAG: %[[#GlobalCharPtr:]] = OpTypePointer CrossWorkgroup %[[#Char]] +; CHECK-SPIRV-DAG: %[[#LocalCharPtr:]] = OpTypePointer Workgroup %[[#Char]] +; CHECK-SPIRV-DAG: %[[#PrivateCharPtr:]] = OpTypePointer Function %[[#Char]] +; CHECK-SPIRV-DAG: %[[#GenericCharPtr:]] = OpTypePointer Generic %[[#Char]] + +; CHECK-SPIRV-DAG: %[[#Int:]] = OpTypeInt 32 0 +; CHECK-SPIRV-DAG: %[[#GlobalIntPtr:]] = OpTypePointer CrossWorkgroup %[[#Int]] +; CHECK-SPIRV-DAG: %[[#PrivateIntPtr:]] = 
OpTypePointer Function %[[#Int]] +; CHECK-SPIRV-DAG: %[[#GenericIntPtr:]] = OpTypePointer Generic %[[#Int]] + +%id = type { %arr } +%arr = type { [1 x i64] } + +@__spirv_BuiltInGlobalInvocationId = external local_unnamed_addr addrspace(1) constant <3 x i64> + +; Mangling + +; CHECK-SPIRV: OpFunction +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericIntPtr]] +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericCharPtr]] +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericIntPtr]] +; CHECK-SPIRV: OpGenericCastToPtr %[[#GlobalCharPtr]] +; CHECK-SPIRV: OpGenericCastToPtr %[[#LocalCharPtr]] +; CHECK-SPIRV: OpGenericCastToPtr %[[#PrivateCharPtr]] +; CHECK-SPIRV: OpFunctionEnd + +; CHECK-SPIRV: OpFunction +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericIntPtr]] +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericCharPtr]] +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericIntPtr]] +; CHECK-SPIRV: OpGenericCastToPtr %[[#GlobalCharPtr]] +; CHECK-SPIRV: OpGenericCastToPtr %[[#LocalCharPtr]] +; CHECK-SPIRV: OpGenericCastToPtr %[[#PrivateCharPtr]] +; CHECK-SPIRV: OpFunctionEnd + +define spir_kernel void @test1(ptr addrspace(1) %_arg_GlobalA, ptr byval(%id) %_arg_GlobalId, ptr addrspace(3) %_arg_LocalA) { +entry: + %var = alloca i32 + %p0 = load i64, ptr %_arg_GlobalId + %add = getelementptr inbounds i32, ptr addrspace(1) %_arg_GlobalA, i64 %p0 + %p2 = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId + %idx = getelementptr inbounds i32, ptr addrspace(1) %add, i64 %p2 + %var1 = addrspacecast ptr addrspace(1) %idx to ptr addrspace(4) + %var2 = addrspacecast ptr addrspace(3) %_arg_LocalA to ptr addrspace(4) + %var3 = addrspacecast ptr %var to ptr addrspace(4) + %G = call spir_func ptr addrspace(1) @_Z33__spirv_GenericCastToPtr_ToGlobalPvi(ptr addrspace(4) %var1, i32 5) + %L = call spir_func ptr addrspace(3) @_Z32__spirv_GenericCastToPtr_ToLocalPvi(ptr addrspace(4) %var2, i32 4) + %P = call spir_func ptr @_Z34__spirv_GenericCastToPtr_ToPrivatePvi(ptr addrspace(4) %var3, i32 7) + ret void +} + +define spir_kernel void @test2(ptr addrspace(1) %_arg_GlobalA, ptr byval(%id) %_arg_GlobalId, ptr addrspace(3) %_arg_LocalA) { +entry: + %var = alloca i32 + %p0 = load i64, ptr %_arg_GlobalId + %add = getelementptr inbounds i32, ptr addrspace(1) %_arg_GlobalA, i64 %p0 + %p2 = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId + %idx = getelementptr inbounds i32, ptr addrspace(1) %add, i64 %p2 + %var1 = addrspacecast ptr addrspace(1) %idx to ptr addrspace(4) + %var2 = addrspacecast ptr addrspace(3) %_arg_LocalA to ptr addrspace(4) + %var3 = addrspacecast ptr %var to ptr addrspace(4) + %G = call spir_func ptr addrspace(1) @_Z9to_globalPv(ptr addrspace(4) %var1) + %L = call spir_func ptr addrspace(3) @_Z8to_localPv(ptr addrspace(4) %var2) + %P = call spir_func ptr @_Z10to_privatePv(ptr addrspace(4) %var3) + ret void +} + +declare spir_func ptr addrspace(1) @_Z33__spirv_GenericCastToPtr_ToGlobalPvi(ptr addrspace(4), i32) +declare spir_func ptr addrspace(3) @_Z32__spirv_GenericCastToPtr_ToLocalPvi(ptr addrspace(4), i32) +declare spir_func ptr @_Z34__spirv_GenericCastToPtr_ToPrivatePvi(ptr addrspace(4), i32) + +declare spir_func ptr addrspace(1) @_Z9to_globalPv(ptr addrspace(4)) +declare spir_func ptr addrspace(3) @_Z8to_localPv(ptr addrspace(4)) +declare spir_func ptr @_Z10to_privatePv(ptr addrspace(4)) + +; No mangling + +; CHECK-SPIRV: OpFunction +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericIntPtr]] +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericCharPtr]] +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericIntPtr]] +; CHECK-SPIRV: 
OpGenericCastToPtr %[[#GlobalIntPtr]] +; CHECK-SPIRV: OpGenericCastToPtr %[[#LocalCharPtr]] +; CHECK-SPIRV: OpGenericCastToPtr %[[#PrivateIntPtr]] +; CHECK-SPIRV: OpFunctionEnd + +; CHECK-SPIRV: OpFunction +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericIntPtr]] +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericCharPtr]] +; CHECK-SPIRV: OpPtrCastToGeneric %[[#GenericIntPtr]] +; CHECK-SPIRV: OpGenericCastToPtr %[[#GlobalIntPtr]] +; CHECK-SPIRV: OpGenericCastToPtr %[[#LocalCharPtr]] +; CHECK-SPIRV: OpGenericCastToPtr %[[#PrivateIntPtr]] +; CHECK-SPIRV: OpFunctionEnd + +define spir_kernel void @test3(ptr addrspace(1) %_arg_GlobalA, ptr byval(%id) %_arg_GlobalId, ptr addrspace(3) %_arg_LocalA) { +entry: + %var = alloca i32 + %p0 = load i64, ptr %_arg_GlobalId + %add = getelementptr inbounds i32, ptr addrspace(1) %_arg_GlobalA, i64 %p0 + %p2 = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId + %idx = getelementptr inbounds i32, ptr addrspace(1) %add, i64 %p2 + %var1 = addrspacecast ptr addrspace(1) %idx to ptr addrspace(4) + %var2 = addrspacecast ptr addrspace(3) %_arg_LocalA to ptr addrspace(4) + %var3 = addrspacecast ptr %var to ptr addrspace(4) + %G = call spir_func ptr addrspace(1) @__spirv_GenericCastToPtr_ToGlobal(ptr addrspace(4) %var1, i32 5) + %L = call spir_func ptr addrspace(3) @__spirv_GenericCastToPtr_ToLocal(ptr addrspace(4) %var2, i32 4) + %P = call spir_func ptr @__spirv_GenericCastToPtr_ToPrivate(ptr addrspace(4) %var3, i32 7) + ret void +} + +define spir_kernel void @test4(ptr addrspace(1) %_arg_GlobalA, ptr byval(%id) %_arg_GlobalId, ptr addrspace(3) %_arg_LocalA) { +entry: + %var = alloca i32 + %p0 = load i64, ptr %_arg_GlobalId + %add = getelementptr inbounds i32, ptr addrspace(1) %_arg_GlobalA, i64 %p0 + %p2 = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId + %idx = getelementptr inbounds i32, ptr addrspace(1) %add, i64 %p2 + %var1 = addrspacecast ptr addrspace(1) %idx to ptr addrspace(4) + %var2 = addrspacecast ptr addrspace(3) %_arg_LocalA to ptr addrspace(4) + %var3 = addrspacecast ptr %var to ptr addrspace(4) + %G = call spir_func ptr addrspace(1) @to_global(ptr addrspace(4) %var1) + %L = call spir_func ptr addrspace(3) @to_local(ptr addrspace(4) %var2) + %P = call spir_func ptr @to_private(ptr addrspace(4) %var3) + ret void +} + +declare spir_func ptr addrspace(1) @__spirv_GenericCastToPtr_ToGlobal(ptr addrspace(4), i32) +declare spir_func ptr addrspace(3) @__spirv_GenericCastToPtr_ToLocal(ptr addrspace(4), i32) +declare spir_func ptr @__spirv_GenericCastToPtr_ToPrivate(ptr addrspace(4), i32) + +declare spir_func ptr addrspace(1) @to_global(ptr addrspace(4)) +declare spir_func ptr addrspace(3) @to_local(ptr addrspace(4)) +declare spir_func ptr @to_private(ptr addrspace(4)) diff --git a/llvm/test/CodeGen/SPIRV/trunc-nonstd-bitwidth.ll b/llvm/test/CodeGen/SPIRV/trunc-nonstd-bitwidth.ll new file mode 100644 index 00000000000000..437e161864eca5 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/trunc-nonstd-bitwidth.ll @@ -0,0 +1,56 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOEXT +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s --spirv-ext=+SPV_INTEL_arbitrary_precision_integers -o - | FileCheck %s --check-prefixes=CHECK,CHECK-EXT + +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOEXT +; RUN: %if spirv-tools %{ llc -O0 
-mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s --spirv-ext=+SPV_INTEL_arbitrary_precision_integers -o - | FileCheck %s --check-prefixes=CHECK,CHECK-EXT + +; CHECK-DAG: OpName %[[#Struct:]] "struct" +; CHECK-DAG: OpName %[[#Arg:]] "arg" +; CHECK-DAG: OpName %[[#QArg:]] "qarg" +; CHECK-DAG: OpName %[[#R:]] "r" +; CHECK-DAG: OpName %[[#Q:]] "q" +; CHECK-DAG: OpName %[[#Tr:]] "tr" +; CHECK-DAG: OpName %[[#Tq:]] "tq" +; CHECK-DAG: %[[#Struct]] = OpTypeStruct %[[#]] %[[#]] %[[#]] +; CHECK-DAG: %[[#PtrStruct:]] = OpTypePointer CrossWorkgroup %[[#Struct]] +; CHECK-EXT-DAG: %[[#Int40:]] = OpTypeInt 40 0 +; CHECK-EXT-DAG: %[[#Int50:]] = OpTypeInt 50 0 +; CHECK-NOEXT-DAG: %[[#Int40:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#PtrInt40:]] = OpTypePointer CrossWorkgroup %[[#Int40]] + +; CHECK: OpFunction + +; CHECK-EXT: %[[#Tr]] = OpUConvert %[[#Int40]] %[[#R]] +; CHECK-EXT: %[[#Store:]] = OpInBoundsPtrAccessChain %[[#PtrStruct]] %[[#Arg]] %[[#]] +; CHECK-EXT: %[[#StoreAsInt40:]] = OpBitcast %[[#PtrInt40]] %[[#Store]] +; CHECK-EXT: OpStore %[[#StoreAsInt40]] %[[#Tr]] + +; CHECK-NOEXT: %[[#Store:]] = OpInBoundsPtrAccessChain %[[#PtrStruct]] %[[#Arg]] %[[#]] +; CHECK-NOEXT: %[[#StoreAsInt40:]] = OpBitcast %[[#PtrInt40]] %[[#Store]] +; CHECK-NOEXT: OpStore %[[#StoreAsInt40]] %[[#R]] + +; CHECK: OpFunction + +; CHECK-EXT: %[[#Tq]] = OpUConvert %[[#Int40]] %[[#Q]] +; CHECK-EXT: OpStore %[[#QArg]] %[[#Tq]] + +; CHECK-NOEXT: OpStore %[[#QArg]] %[[#Q]] + +%struct = type <{ i32, i8, [3 x i8] }> + +define spir_kernel void @foo(ptr addrspace(1) %arg, i64 %r) { + %tr = trunc i64 %r to i40 + %addr = getelementptr inbounds %struct, ptr addrspace(1) %arg, i64 0 + store i40 %tr, ptr addrspace(1) %addr + ret void +} + +define spir_kernel void @bar(ptr addrspace(1) %qarg, i50 %q) { + %tq = trunc i50 %q to i40 + store i40 %tq, ptr addrspace(1) %qarg + ret void +} diff --git a/llvm/test/CodeGen/WebAssembly/libcalls.ll b/llvm/test/CodeGen/WebAssembly/libcalls.ll index 4f57c347a1a335..70f000664d388a 100644 --- a/llvm/test/CodeGen/WebAssembly/libcalls.ll +++ b/llvm/test/CodeGen/WebAssembly/libcalls.ll @@ -12,6 +12,7 @@ declare fp128 @llvm.nearbyint.f128(fp128) declare fp128 @llvm.pow.f128(fp128, fp128) declare fp128 @llvm.powi.f128.i32(fp128, i32) +declare double @llvm.tan.f64(double) declare double @llvm.cos.f64(double) declare double @llvm.log10.f64(double) declare double @llvm.pow.f64(double, double) @@ -240,42 +241,44 @@ define double @f64libcalls(double %x, double %y, i32 %z) { ; CHECK: .functype f64libcalls (f64, f64, i32) -> (f64) ; CHECK-NEXT: .local i32 ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: global.get $push11=, __stack_pointer -; CHECK-NEXT: i32.const $push12=, 16 -; CHECK-NEXT: i32.sub $push18=, $pop11, $pop12 -; CHECK-NEXT: local.tee $push17=, 3, $pop18 -; CHECK-NEXT: global.set __stack_pointer, $pop17 -; CHECK-NEXT: local.get $push22=, 0 -; CHECK-NEXT: local.get $push19=, 0 -; CHECK-NEXT: call $push0=, cos, $pop19 -; CHECK-NEXT: call $push1=, log10, $pop0 -; CHECK-NEXT: local.get $push20=, 1 -; CHECK-NEXT: call $push2=, pow, $pop1, $pop20 -; CHECK-NEXT: local.get $push21=, 2 -; CHECK-NEXT: call $push3=, __powidf2, $pop2, $pop21 -; CHECK-NEXT: call $push4=, log, $pop3 -; CHECK-NEXT: call $push5=, exp, $pop4 -; CHECK-NEXT: call $push6=, exp10, $pop5 -; CHECK-NEXT: call $push7=, cbrt, $pop6 -; CHECK-NEXT: call $push8=, lround, $pop7 -; CHECK-NEXT: call $push9=, ldexp, $pop22, $pop8 -; CHECK-NEXT: local.get $push23=, 3 -; CHECK-NEXT: i32.const 
$push15=, 12 -; CHECK-NEXT: i32.add $push16=, $pop23, $pop15 -; CHECK-NEXT: call $push24=, frexp, $pop9, $pop16 -; CHECK-NEXT: local.set 0, $pop24 -; CHECK-NEXT: local.get $push25=, 3 -; CHECK-NEXT: i32.load $push10=, 12($pop25) -; CHECK-NEXT: call escape_value, $pop10 -; CHECK-NEXT: local.get $push26=, 3 +; CHECK-NEXT: global.get $push12=, __stack_pointer ; CHECK-NEXT: i32.const $push13=, 16 -; CHECK-NEXT: i32.add $push14=, $pop26, $pop13 -; CHECK-NEXT: global.set __stack_pointer, $pop14 -; CHECK-NEXT: local.get $push27=, 0 -; CHECK-NEXT: return $pop27 +; CHECK-NEXT: i32.sub $push19=, $pop12, $pop13 +; CHECK-NEXT: local.tee $push18=, 3, $pop19 +; CHECK-NEXT: global.set __stack_pointer, $pop18 +; CHECK-NEXT: local.get $push23=, 0 +; CHECK-NEXT: local.get $push20=, 0 +; CHECK-NEXT: call $push0=, tan, $pop20 +; CHECK-NEXT: call $push1=, cos, $pop0 +; CHECK-NEXT: call $push2=, log10, $pop1 +; CHECK-NEXT: local.get $push21=, 1 +; CHECK-NEXT: call $push3=, pow, $pop2, $pop21 +; CHECK-NEXT: local.get $push22=, 2 +; CHECK-NEXT: call $push4=, __powidf2, $pop3, $pop22 +; CHECK-NEXT: call $push5=, log, $pop4 +; CHECK-NEXT: call $push6=, exp, $pop5 +; CHECK-NEXT: call $push7=, exp10, $pop6 +; CHECK-NEXT: call $push8=, cbrt, $pop7 +; CHECK-NEXT: call $push9=, lround, $pop8 +; CHECK-NEXT: call $push10=, ldexp, $pop23, $pop9 +; CHECK-NEXT: local.get $push24=, 3 +; CHECK-NEXT: i32.const $push16=, 12 +; CHECK-NEXT: i32.add $push17=, $pop24, $pop16 +; CHECK-NEXT: call $push25=, frexp, $pop10, $pop17 +; CHECK-NEXT: local.set 0, $pop25 +; CHECK-NEXT: local.get $push26=, 3 +; CHECK-NEXT: i32.load $push11=, 12($pop26) +; CHECK-NEXT: call escape_value, $pop11 +; CHECK-NEXT: local.get $push27=, 3 +; CHECK-NEXT: i32.const $push14=, 16 +; CHECK-NEXT: i32.add $push15=, $pop27, $pop14 +; CHECK-NEXT: global.set __stack_pointer, $pop15 +; CHECK-NEXT: local.get $push28=, 0 +; CHECK-NEXT: return $pop28 - %a = call double @llvm.cos.f64(double %x) + %k = call double @llvm.tan.f64(double %x) + %a = call double @llvm.cos.f64(double %k) %b = call double @llvm.log10.f64(double %a) %c = call double @llvm.pow.f64(double %b, double %y) %d = call double @llvm.powi.f64.i32(double %c, i32 %z) diff --git a/llvm/test/CodeGen/X86/2008-06-16-SubregsBug.ll b/llvm/test/CodeGen/X86/2008-06-16-SubregsBug.ll index feaa38a7600a21..00ffea903079ef 100644 --- a/llvm/test/CodeGen/X86/2008-06-16-SubregsBug.ll +++ b/llvm/test/CodeGen/X86/2008-06-16-SubregsBug.ll @@ -8,8 +8,8 @@ define i16 @test(ptr %tmp179) nounwind { ; CHECK-NEXT: movzwl (%eax), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: andl $64512, %ecx ## imm = 0xFC00 -; CHECK-NEXT: cmpl $32768, %ecx ## imm = 0x8000 -; CHECK-NEXT: jne LBB0_2 +; CHECK-NEXT: negw %cx +; CHECK-NEXT: jno LBB0_2 ; CHECK-NEXT: ## %bb.1: ## %bb189 ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll index dde877c5bb61e5..bae140abdf6b1a 100644 --- a/llvm/test/CodeGen/X86/abs.ll +++ b/llvm/test/CodeGen/X86/abs.ll @@ -783,8 +783,7 @@ define i32 @test_minsigned_i32(i32 %a0, i32 %a1) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: negl %eax ; X64-NEXT: cmovsl %edi, %eax -; X64-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000 -; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: cmovol %esi, %eax ; X64-NEXT: retq ; ; X86-LABEL: test_minsigned_i32: @@ -793,11 +792,7 @@ define i32 @test_minsigned_i32(i32 %a0, i32 %a1) nounwind { ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negl %eax ; X86-NEXT: cmovsl %ecx, %eax -; X86-NEXT: cmpl 
$-2147483648, %ecx # imm = 0x80000000 -; X86-NEXT: jne .LBB19_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: .LBB19_2: +; X86-NEXT: cmovol {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl %lim = icmp eq i32 %a0, -2147483648 %abs = tail call i32 @llvm.abs.i32(i32 %a0, i1 false) @@ -811,9 +806,7 @@ define i64 @test_minsigned_i64(i64 %a0, i64 %a1) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: negq %rax ; X64-NEXT: cmovsq %rdi, %rax -; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-NEXT: cmpq %rcx, %rdi -; X64-NEXT: cmoveq %rsi, %rax +; X64-NEXT: cmovoq %rsi, %rax ; X64-NEXT: retq ; ; X86-LABEL: test_minsigned_i64: diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 49797fbefa5973..5c5487815b3360 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -58,8 +58,8 @@ define i32 @combine_sdiv_by_minsigned(i32 %x) { ; CHECK-LABEL: combine_sdiv_by_minsigned: ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000 -; CHECK-NEXT: sete %al +; CHECK-NEXT: negl %edi +; CHECK-NEXT: seto %al ; CHECK-NEXT: retq %1 = sdiv i32 %x, -2147483648 ret i32 %1 diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll index 999be0f98b6fc5..532b2c09a9175f 100644 --- a/llvm/test/CodeGen/X86/is_fpclass.ll +++ b/llvm/test/CodeGen/X86/is_fpclass.ll @@ -937,15 +937,16 @@ entry: define i1 @is_minus_zero_f(float %x) { ; X86-LABEL: is_minus_zero_f: ; X86: # %bb.0: # %entry -; X86-NEXT: cmpl $-2147483648, {{[0-9]+}}(%esp) # imm = 0x80000000 -; X86-NEXT: sete %al +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: seto %al ; X86-NEXT: retl ; ; X64-LABEL: is_minus_zero_f: ; X64: # %bb.0: # %entry ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 -; X64-NEXT: sete %al +; X64-NEXT: negl %eax +; X64-NEXT: seto %al ; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 32) ; 0x20 = "-zero" @@ -955,15 +956,16 @@ entry: define i1 @not_is_minus_zero_f(float %x) { ; X86-LABEL: not_is_minus_zero_f: ; X86: # %bb.0: # %entry -; X86-NEXT: cmpl $-2147483648, {{[0-9]+}}(%esp) # imm = 0x80000000 -; X86-NEXT: setne %al +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: setno %al ; X86-NEXT: retl ; ; X64-LABEL: not_is_minus_zero_f: ; X64: # %bb.0: # %entry ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 -; X64-NEXT: setne %al +; X64-NEXT: negl %eax +; X64-NEXT: setno %al ; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 991) ; ~0x20 = ~"-zero" diff --git a/llvm/test/CodeGen/X86/lsr-overflow.ll b/llvm/test/CodeGen/X86/lsr-overflow.ll index 09c1c07ef3de01..79440c282be758 100644 --- a/llvm/test/CodeGen/X86/lsr-overflow.ll +++ b/llvm/test/CodeGen/X86/lsr-overflow.ll @@ -4,9 +4,9 @@ ; The comparison uses the pre-inc value, which could lead LSR to ; try to compute -INT64_MIN. 
-; CHECK: movabsq $-9223372036854775808, %rax -; CHECK: cmpq %rax, -; CHECK: sete %al +; CHECK-NOT: movabsq $-9223372036854775808, %rax +; CHECK: negq %r +; CHECK-NEXT: seto %al declare i64 @bar() diff --git a/llvm/test/CodeGen/X86/shrink-compare-pgso.ll b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll index 254b8fe3fc6e30..5a15ee36c07263 100644 --- a/llvm/test/CodeGen/X86/shrink-compare-pgso.ll +++ b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll @@ -265,8 +265,8 @@ if.end: define dso_local void @test_sext_i8_icmp_neg128(i8 %x) nounwind !prof !14 { ; CHECK-LABEL: test_sext_i8_icmp_neg128: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpb $-128, %dil -; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: negb %dil +; CHECK-NEXT: jo bar # TAILCALL ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/shrink-compare.ll b/llvm/test/CodeGen/X86/shrink-compare.ll index 840167ff9f4a0c..1a61451c26a03c 100644 --- a/llvm/test/CodeGen/X86/shrink-compare.ll +++ b/llvm/test/CodeGen/X86/shrink-compare.ll @@ -265,8 +265,8 @@ if.end: define dso_local void @test_sext_i8_icmp_neg128(i8 %x) nounwind minsize { ; CHECK-LABEL: test_sext_i8_icmp_neg128: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpb $-128, %dil -; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: negb %dil +; CHECK-NEXT: jo bar # TAILCALL ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: retq entry: diff --git a/llvm/test/DebugInfo/symbolize-gnu-debuglink-fallback.test b/llvm/test/DebugInfo/symbolize-gnu-debuglink-fallback.test index 43d5a2c818fac3..c0d6f83e4af0d5 100644 --- a/llvm/test/DebugInfo/symbolize-gnu-debuglink-fallback.test +++ b/llvm/test/DebugInfo/symbolize-gnu-debuglink-fallback.test @@ -1,4 +1,4 @@ -# REQUIRES: shell +# UNSUPPORTED: system-windows # Ensures that .debuglink can fallback to a separate location. This is normally # /usr/lib/debug (or /usr/libdata/debug for NetBSD), but can be configured on # the command line (mainly for testing). 
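The X86 test updates above (abs, combine-sdiv, is_fpclass, lsr-overflow, shrink-compare) all exercise one identity: `neg x` overflows exactly when `x` is the minimum signed value, so `x == INT_MIN` can be lowered as NEG plus a check of the overflow flag instead of comparing against a wide immediate. A self-contained sanity check of that identity; the helper name is mine, and `__builtin_sub_overflow` is a GCC/Clang builtin:

```cpp
// Verifies the identity behind the new lowering: 0 - x overflows in a
// signed type iff x is the minimum signed value.
#include <cassert>
#include <climits>

static bool isIntMinViaNeg(int x) {
  int neg;
  return __builtin_sub_overflow(0, x, &neg); // mirrors the OF bit of `negl`
}

int main() {
  assert(isIntMinViaNeg(INT_MIN));   // only INT_MIN makes NEG overflow
  assert(!isIntMinViaNeg(INT_MAX));
  assert(!isIntMinViaNeg(0));
  assert(!isIntMinViaNeg(-1));
  return 0;
}
```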
diff --git a/llvm/test/Other/cgscc-refscc-mutation-order.ll b/llvm/test/Other/cgscc-refscc-mutation-order.ll index 13a46503c1f4c9..aa207357157633 100644 --- a/llvm/test/Other/cgscc-refscc-mutation-order.ll +++ b/llvm/test/Other/cgscc-refscc-mutation-order.ll @@ -15,8 +15,6 @@ ; CHECK-NOT: InstCombinePass ; CHECK: Running pass: InstCombinePass on f4 ; CHECK-NOT: InstCombinePass -; CHECK: Running pass: InstCombinePass on f1 -; CHECK-NOT: InstCombinePass @a1 = alias void (), ptr @f1 @a2 = alias void (), ptr @f2 diff --git a/llvm/test/Other/devirt-invalidated.ll b/llvm/test/Other/devirt-invalidated.ll index c3ed5e53b3b04c..7926641dda97bb 100644 --- a/llvm/test/Other/devirt-invalidated.ll +++ b/llvm/test/Other/devirt-invalidated.ll @@ -1,8 +1,6 @@ ; RUN: opt -passes='devirt<0>(inline)' < %s -S | FileCheck %s -; CHECK-NOT: internal ; CHECK: define void @e() -; CHECK-NOT: internal define void @e() { entry: diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll index 2df81d6cb1832d..1c34fff8dd7554 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll @@ -37,7 +37,6 @@ define internal i32 @caller(ptr %B) { ; CGSCC-LABEL: define {{[^@]+}}@caller ; CGSCC-SAME: () #[[ATTR0]] { ; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 -; CGSCC-NEXT: [[A2:%.*]] = alloca i8, i32 0, align 4 ; CGSCC-NEXT: [[A1:%.*]] = alloca i8, i32 0, align 4 ; CGSCC-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll index 7c28de24beea27..b42647840f7cfc 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll @@ -54,7 +54,6 @@ define internal i32 @caller(ptr %B) { ; CGSCC-LABEL: define {{[^@]+}}@caller ; CGSCC-SAME: (ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B:%.*]]) #[[ATTR0]] { ; CGSCC-NEXT: [[A:%.*]] = alloca i32, align 4 -; CGSCC-NEXT: [[A2:%.*]] = alloca i8, i32 0, align 4 ; CGSCC-NEXT: [[A1:%.*]] = alloca i8, i32 0, align 4 ; CGSCC-NEXT: [[C:%.*]] = call i32 @test(ptr noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) #[[ATTR2:[0-9]+]] ; CGSCC-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/Inline/cgscc-cycle-debug.ll b/llvm/test/Transforms/Inline/cgscc-cycle-debug.ll index 40a6b0577e7dd0..e79700e8dac624 100644 --- a/llvm/test/Transforms/Inline/cgscc-cycle-debug.ll +++ b/llvm/test/Transforms/Inline/cgscc-cycle-debug.ll @@ -13,7 +13,6 @@ ; CHECK: Running an SCC pass across the RefSCC: [(test1_a, test1_b, test1_c)] ; CHECK: Enqueuing the existing SCC in the worklist:(test1_b) ; CHECK: Enqueuing a newly formed SCC:(test1_c) -; CHECK: Enqueuing a new RefSCC in the update worklist: [(test1_b)] ; CHECK: Switch an internal ref edge to a call edge from 'test1_a' to 'test1_c' ; CHECK: Switch an internal ref edge to a call edge from 'test1_a' to 'test1_a' ; CHECK: Re-running SCC passes after a refinement of the current SCC: (test1_c, test1_a) diff --git a/llvm/test/tools/llvm-cov/gcov/basic.test b/llvm/test/tools/llvm-cov/gcov/basic.test index 5313fe2d7a69a1..7557739add8ba5 100644 --- a/llvm/test/tools/llvm-cov/gcov/basic.test +++ b/llvm/test/tools/llvm-cov/gcov/basic.test @@ -3,7 +3,7 @@ # 
Test fails on Windows where internal shell is used due to path separator # mismatches. -REQUIRES: shell +UNSUPPORTED: system-windows RUN: rm -rf %t RUN: mkdir %t diff --git a/llvm/test/tools/llvm-rc/windres-prefix.test b/llvm/test/tools/llvm-rc/windres-prefix.test index 7dda51d0635232..4c53fdfc3db65e 100644 --- a/llvm/test/tools/llvm-rc/windres-prefix.test +++ b/llvm/test/tools/llvm-rc/windres-prefix.test @@ -1,3 +1,5 @@ +; REQUIRES: shell + ; RUN: rm -rf %t && mkdir %t ; Check that a triple prefix on the executable gets picked up as target triple. diff --git a/llvm/test/tools/llvm-rc/windres-preproc.test b/llvm/test/tools/llvm-rc/windres-preproc.test index 13f82299a074bb..52427862e760b8 100644 --- a/llvm/test/tools/llvm-rc/windres-preproc.test +++ b/llvm/test/tools/llvm-rc/windres-preproc.test @@ -1,7 +1,6 @@ ;; Some quoted arguments below don't work properly on Windows when llvm-lit -;; invokes the cmd shell to run the commands. Just require running in a -;; posix shell, to keep being able to test this corner case on Unix at least. -; REQUIRES: shell +;; invokes the cmd shell to run the commands. +; UNSUPPORTED: system-windows ; RUN: llvm-windres -### --include-dir %p/incdir1 --include %p/incdir2 "-DFOO1=\\\"foo bar\\\"" -UFOO2 -D FOO3 --preprocessor-arg "-DFOO4=\\\"baz baz\\\"" -DFOO5=\"bar\" %p/Inputs/empty.rc %t.res | FileCheck %s --check-prefix=CHECK1 ; RUN: llvm-windres -### --include-dir %p/incdir1 --include %p/incdir2 "-DFOO1=\"foo bar\"" -UFOO2 -D FOO3 --preprocessor-arg "-DFOO4=\"baz baz\"" "-DFOO5=bar" %p/Inputs/empty.rc %t.res --use-temp-file | FileCheck %s --check-prefix=CHECK1 diff --git a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp index b33567dd602b0e..aab148c12c4164 100644 --- a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp +++ b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp @@ -1659,18 +1659,16 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses9) { Function *FnF = M->getFunction("f"); // Use the CallGraphUpdater to update the call graph. - { - CallGraphUpdater CGU; - CGU.initialize(CG, C, AM, UR); - ASSERT_NO_FATAL_FAILURE(CGU.removeFunction(*FnF)); - ASSERT_EQ(M->getFunctionList().size(), 6U); - } - ASSERT_EQ(M->getFunctionList().size(), 5U); + CallGraphUpdater CGU; + CGU.initialize(CG, C, AM, UR); + ASSERT_NO_FATAL_FAILURE(CGU.removeFunction(*FnF)); + ASSERT_EQ(M->getFunctionList().size(), 6U); })); ModulePassManager MPM; MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); MPM.run(*M, MAM); + ASSERT_EQ(M->getFunctionList().size(), 5U); } TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses10) { diff --git a/llvm/unittests/Analysis/LazyCallGraphTest.cpp b/llvm/unittests/Analysis/LazyCallGraphTest.cpp index 69af7d92c7cf0d..64b6ccddc53b0c 100644 --- a/llvm/unittests/Analysis/LazyCallGraphTest.cpp +++ b/llvm/unittests/Analysis/LazyCallGraphTest.cpp @@ -1169,7 +1169,7 @@ TEST(LazyCallGraphTest, InlineAndDeleteFunction) { LazyCallGraph::SCC &NewDC = *NewCs.begin(); EXPECT_EQ(&NewDC, CG.lookupSCC(D1)); EXPECT_EQ(&NewDC, CG.lookupSCC(D3)); - auto NewRCs = DRC.removeInternalRefEdge(D1, {&D2}); + auto NewRCs = DRC.removeInternalRefEdges({{&D1, &D2}}); ASSERT_EQ(2u, NewRCs.size()); LazyCallGraph::RefSCC &NewDRC = *NewRCs[0]; EXPECT_EQ(&NewDRC, CG.lookupRefSCC(D1)); @@ -1186,7 +1186,8 @@ TEST(LazyCallGraphTest, InlineAndDeleteFunction) { EXPECT_TRUE(D2RC.isParentOf(NewDRC)); // Now that we've updated the call graph, D2 is dead, so remove it. 
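The unit-test churn below follows from splitting function removal into two steps: `markDeadFunction` detaches the node from the lazy call graph immediately, while `removeDeadFunctions` erases the IR for a whole batch later. A hedged sketch of the calling convention, assuming only the signatures this patch introduces and a function already known to be dead:

```cpp
// Sketch (signatures as introduced by this patch): retire a dead function
// in two phases so the IR deletion can be batched after the SCC walk.
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/IR/Function.h"
using namespace llvm;

void retireDeadFunction(LazyCallGraph &CG, Function &F) {
  CG.markDeadFunction(F);        // detach F from the call graph right away
  CG.removeDeadFunctions({&F});  // later: erase the IR, possibly in a batch
}
```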
- CG.removeDeadFunction(D2F); + CG.markDeadFunction(D2F); + CG.removeDeadFunctions({&D2F}); // Check that the graph still looks the same. EXPECT_EQ(&ARC, CG.lookupRefSCC(A1)); @@ -1344,7 +1345,7 @@ TEST(LazyCallGraphTest, InternalEdgeRemoval) { // Remove the edge from b -> a, which should leave the 3 functions still in // a single connected component because of a -> b -> c -> a. SmallVector NewRCs = - RC.removeInternalRefEdge(B, {&A}); + RC.removeInternalRefEdges({{&B, &A}}); EXPECT_EQ(0u, NewRCs.size()); EXPECT_EQ(&RC, CG.lookupRefSCC(A)); EXPECT_EQ(&RC, CG.lookupRefSCC(B)); @@ -1360,7 +1361,7 @@ TEST(LazyCallGraphTest, InternalEdgeRemoval) { // Remove the edge from c -> a, which should leave 'a' in the original RefSCC // and form a new RefSCC for 'b' and 'c'. - NewRCs = RC.removeInternalRefEdge(C, {&A}); + NewRCs = RC.removeInternalRefEdges({{&C, &A}}); ASSERT_EQ(2u, NewRCs.size()); LazyCallGraph::RefSCC &BCRC = *NewRCs[0]; LazyCallGraph::RefSCC &ARC = *NewRCs[1]; @@ -1425,7 +1426,7 @@ TEST(LazyCallGraphTest, InternalMultiEdgeRemoval) { // Remove the edges from b -> a and b -> c, leaving b in its own RefSCC. SmallVector NewRCs = - RC.removeInternalRefEdge(B, {&A, &C}); + RC.removeInternalRefEdges({{&B, &A}, {&B, &C}}); ASSERT_EQ(2u, NewRCs.size()); LazyCallGraph::RefSCC &BRC = *NewRCs[0]; @@ -1494,7 +1495,7 @@ TEST(LazyCallGraphTest, InternalNoOpEdgeRemoval) { // Remove the edge from a -> c which doesn't change anything. SmallVector NewRCs = - RC.removeInternalRefEdge(AN, {&CN}); + RC.removeInternalRefEdges({{&AN, &CN}}); EXPECT_EQ(0u, NewRCs.size()); EXPECT_EQ(&RC, CG.lookupRefSCC(AN)); EXPECT_EQ(&RC, CG.lookupRefSCC(BN)); @@ -1509,8 +1510,8 @@ TEST(LazyCallGraphTest, InternalNoOpEdgeRemoval) { // Remove the edge from b -> a and c -> b; again this doesn't change // anything. - NewRCs = RC.removeInternalRefEdge(BN, {&AN}); - NewRCs = RC.removeInternalRefEdge(CN, {&BN}); + NewRCs = RC.removeInternalRefEdges({{&BN, &AN}}); + NewRCs = RC.removeInternalRefEdges({{&CN, &BN}}); EXPECT_EQ(0u, NewRCs.size()); EXPECT_EQ(&RC, CG.lookupRefSCC(AN)); EXPECT_EQ(&RC, CG.lookupRefSCC(BN)); @@ -2163,7 +2164,8 @@ TEST(LazyCallGraphTest, RemoveFunctionWithSpuriousRef) { // Now delete 'dead'. There are no uses of this function but there are // spurious references. - CG.removeDeadFunction(DeadN.getFunction()); + CG.markDeadFunction(DeadN.getFunction()); + CG.removeDeadFunctions({&DeadN.getFunction()}); // The only observable change should be that the RefSCC is gone from the // postorder sequence. @@ -2212,7 +2214,8 @@ TEST(LazyCallGraphTest, RemoveFunctionWithSpuriousRefRecursive) { // Now delete 'a'. There are no uses of this function but there are // spurious references. - CG.removeDeadFunction(AN.getFunction()); + CG.markDeadFunction(AN.getFunction()); + CG.removeDeadFunctions({&AN.getFunction()}); // The only observable change should be that the RefSCC is gone from the // postorder sequence. @@ -2269,7 +2272,8 @@ TEST(LazyCallGraphTest, RemoveFunctionWithSpuriousRefRecursive2) { // Now delete 'a'. There are no uses of this function but there are // spurious references. - CG.removeDeadFunction(AN.getFunction()); + CG.markDeadFunction(AN.getFunction()); + CG.removeDeadFunctions({&AN.getFunction()}); // The only observable change should be that the RefSCC is gone from the // postorder sequence. @@ -2320,7 +2324,8 @@ TEST(LazyCallGraphTest, RemoveFunctionWithSpuriousRefRecursive3) { // Now delete 'a'. There are no uses of this function but there are // spurious references. 
- CG.removeDeadFunction(AN.getFunction()); + CG.markDeadFunction(AN.getFunction()); + CG.removeDeadFunctions({&AN.getFunction()}); // The only observable change should be that the RefSCC is gone from the // postorder sequence. diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 323470940fec5c..60a0402103ce05 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -32,6 +32,7 @@ #include <cstdint> #include <iterator> #include <map> +#include <set> #include <string> #include <vector> @@ -255,6 +256,9 @@ unsigned SubtargetEmitter::FeatureKeyValues( llvm::sort(FeatureList, LessRecordFieldName()); + // Check that there are no duplicate keys + std::set<StringRef> UniqueKeys; + // Begin feature table OS << "// Sorted (by key) array of values for CPU features.\n" << "extern const llvm::SubtargetFeatureKV " << Target @@ -283,6 +287,10 @@ unsigned SubtargetEmitter::FeatureKeyValues( OS << " },\n"; ++NumFeatures; + + if (!UniqueKeys.insert(CommandLineName).second) + PrintFatalError("Duplicate key in SubtargetFeatureKV: " + + CommandLineName); } // End feature table diff --git a/mlir/include/mlir-c/Bindings/Python/Interop.h b/mlir/include/mlir-c/Bindings/Python/Interop.h index 0a36e97c2ae683..a33190c380d37d 100644 --- a/mlir/include/mlir-c/Bindings/Python/Interop.h +++ b/mlir/include/mlir-c/Bindings/Python/Interop.h @@ -39,6 +39,7 @@ #include "mlir-c/IR.h" #include "mlir-c/IntegerSet.h" #include "mlir-c/Pass.h" +#include "mlir-c/Rewrite.h" // The 'mlir' Python package is relocatable and supports co-existing in multiple // projects. Each project must define its outer package prefix with this define @@ -284,6 +285,26 @@ static inline MlirModule mlirPythonCapsuleToModule(PyObject *capsule) { return module; } +/** Creates a capsule object encapsulating the raw C-API + * MlirFrozenRewritePatternSet. + * The returned capsule does not extend or affect ownership of any Python + * objects that reference the pattern set in any way. */ +static inline PyObject * +mlirPythonFrozenRewritePatternSetToCapsule(MlirFrozenRewritePatternSet pm) { + return PyCapsule_New(MLIR_PYTHON_GET_WRAPPED_POINTER(pm), + MLIR_PYTHON_CAPSULE_PASS_MANAGER, NULL); +} + +/** Extracts an MlirFrozenRewritePatternSet from a capsule as produced from + * mlirPythonFrozenRewritePatternSetToCapsule. If the capsule is not of the + * right type, then a null pattern set is returned. */ +static inline MlirFrozenRewritePatternSet +mlirPythonCapsuleToFrozenRewritePatternSet(PyObject *capsule) { + void *ptr = PyCapsule_GetPointer(capsule, MLIR_PYTHON_CAPSULE_PASS_MANAGER); + MlirFrozenRewritePatternSet pm = {ptr}; + return pm; +} + /** Creates a capsule object encapsulating the raw C-API MlirPassManager. * The returned capsule does not extend or affect ownership of any Python * objects that reference the module in any way. */ diff --git a/mlir/include/mlir-c/Rewrite.h b/mlir/include/mlir-c/Rewrite.h new file mode 100644 index 00000000000000..45218a1cd4ebd5 --- /dev/null +++ b/mlir/include/mlir-c/Rewrite.h @@ -0,0 +1,60 @@ +//===-- mlir-c/Rewrite.h - Helpers for C API to Rewrites ----------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header declares the registration and creation methods for +// rewrite patterns.
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_C_REWRITE_H +#define MLIR_C_REWRITE_H + +#include "mlir-c/IR.h" +#include "mlir-c/Support.h" +#include "mlir/Config/mlir-config.h" + +//===----------------------------------------------------------------------===// +/// Opaque type declarations (see mlir-c/IR.h for more details). +//===----------------------------------------------------------------------===// + +#define DEFINE_C_API_STRUCT(name, storage) \ + struct name { \ + storage *ptr; \ + }; \ + typedef struct name name + +DEFINE_C_API_STRUCT(MlirFrozenRewritePatternSet, void); +DEFINE_C_API_STRUCT(MlirGreedyRewriteDriverConfig, void); +DEFINE_C_API_STRUCT(MlirRewritePatternSet, void); + +MLIR_CAPI_EXPORTED MlirFrozenRewritePatternSet +mlirFreezeRewritePattern(MlirRewritePatternSet op); + +MLIR_CAPI_EXPORTED void +mlirFrozenRewritePatternSetDestroy(MlirFrozenRewritePatternSet op); + +MLIR_CAPI_EXPORTED MlirLogicalResult mlirApplyPatternsAndFoldGreedily( + MlirModule op, MlirFrozenRewritePatternSet patterns, + MlirGreedyRewriteDriverConfig); + +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH +DEFINE_C_API_STRUCT(MlirPDLPatternModule, void); + +MLIR_CAPI_EXPORTED MlirPDLPatternModule +mlirPDLPatternModuleFromModule(MlirModule op); + +MLIR_CAPI_EXPORTED void mlirPDLPatternModuleDestroy(MlirPDLPatternModule op); + +MLIR_CAPI_EXPORTED MlirRewritePatternSet +mlirRewritePatternSetFromPDLPatternModule(MlirPDLPatternModule op); +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH + +#undef DEFINE_C_API_STRUCT + +#endif // MLIR_C_REWRITE_H diff --git a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h index d8f22c7aa17096..ebf50109f72f23 100644 --- a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h +++ b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h @@ -198,6 +198,27 @@ struct type_caster { }; }; +/// Casts object <-> MlirFrozenRewritePatternSet. +template <> +struct type_caster { + PYBIND11_TYPE_CASTER(MlirFrozenRewritePatternSet, + _("MlirFrozenRewritePatternSet")); + bool load(handle src, bool) { + py::object capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToFrozenRewritePatternSet(capsule.ptr()); + return value.ptr != nullptr; + } + static handle cast(MlirFrozenRewritePatternSet v, return_value_policy, + handle) { + py::object capsule = py::reinterpret_steal( + mlirPythonFrozenRewritePatternSetToCapsule(v)); + return py::module::import(MAKE_MLIR_PYTHON_QUALNAME("rewrite")) + .attr("FrozenRewritePatternSet") + .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) + .release(); + }; +}; + /// Casts object <-> MlirOperation. 
template <> struct type_caster<MlirOperation> { diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index d6d038ef65bdf4..3043a0c4dc4109 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h @@ -248,6 +248,12 @@ std::unique_ptr<Pass> createSparsificationAndBufferizationPass( bool enableBufferInitialization, unsigned vectorLength, bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen); +//===----------------------------------------------------------------------===// +// Sparse Iteration Transform Passes +//===----------------------------------------------------------------------===// + +std::unique_ptr<Pass> createSparseSpaceCollapsePass(); + //===----------------------------------------------------------------------===// // Registration. //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td index 2f844cee5ff528..196110f55571d2 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td @@ -462,6 +462,34 @@ def SparsificationAndBufferization : Pass<"sparsification-and-bufferization", "M "sparse_tensor::SparseTensorDialect", "vector::VectorDialect" ]; + // Important optimization options are made visible to the mini-pipeline + // so that clients can set these (when not using the full pipeline). + let options = [ + Option<"vectorLength", "vl", "int32_t", "0", + "Set the vector length (use 0 to disable vectorization)">, + Option<"enableVLAVectorization", "enable-vla-vectorization", "bool", "false", + "Enable vector length agnostic vectorization">, + Option<"enableSIMDIndex32", "enable-simd-index32", "bool", "false", + "Enable i32 indexing into vectors (for efficient gather/scatter)">, + Option<"enableGPULibgen", "enable-gpu-libgen", "bool", "false", + "Enable GPU acceleration by means of direct library calls">, + ]; +} + +//===----------------------------------------------------------------------===// +// Sparse Iteration Transform Passes +//===----------------------------------------------------------------------===// + +def SparseSpaceCollapse : Pass<"sparse-space-collapse", "func::FuncOp"> { + let summary = "sparse space collapsing pass"; + let description = [{ + This pass collapses consecutive sparse spaces (extracted from the same tensor) + into one multi-dimensional space. The pass is not yet stabilized.
+ }]; + let constructor = "mlir::createSparseSpaceCollapsePass()"; + let dependentDialects = [ + "sparse_tensor::SparseTensorDialect", + ]; } #endif // MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_PASSES diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 60cd39302c6387..172898cfda0c52 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -22,6 +22,7 @@ #include "mlir-c/Diagnostics.h" #include "mlir-c/IR.h" #include "mlir-c/IntegerSet.h" +#include "mlir-c/Transforms.h" #include "mlir/Bindings/Python/PybindAdaptors.h" #include "llvm/ADT/DenseMap.h" diff --git a/mlir/lib/Bindings/Python/MainModule.cpp b/mlir/lib/Bindings/Python/MainModule.cpp index 17272472ccca42..8da1ab16a4514b 100644 --- a/mlir/lib/Bindings/Python/MainModule.cpp +++ b/mlir/lib/Bindings/Python/MainModule.cpp @@ -11,6 +11,7 @@ #include "Globals.h" #include "IRModule.h" #include "Pass.h" +#include "Rewrite.h" namespace py = pybind11; using namespace mlir; @@ -116,6 +117,9 @@ PYBIND11_MODULE(_mlir, m) { populateIRInterfaces(irModule); populateIRTypes(irModule); + auto rewriteModule = m.def_submodule("rewrite", "MLIR Rewrite Bindings"); + populateRewriteSubmodule(rewriteModule); + // Define and populate PassManager submodule. auto passModule = m.def_submodule("passmanager", "MLIR Pass Management Bindings"); diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp new file mode 100644 index 00000000000000..1d8128be9f0826 --- /dev/null +++ b/mlir/lib/Bindings/Python/Rewrite.cpp @@ -0,0 +1,110 @@ +//===- Rewrite.cpp - Rewrite ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Rewrite.h" + +#include "IRModule.h" +#include "mlir-c/Bindings/Python/Interop.h" +#include "mlir-c/Rewrite.h" +#include "mlir/Config/mlir-config.h" + +namespace py = pybind11; +using namespace mlir; +using namespace py::literals; +using namespace mlir::python; + +namespace { + +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH +/// Owning Wrapper around a PDLPatternModule. +class PyPDLPatternModule { +public: + PyPDLPatternModule(MlirPDLPatternModule module) : module(module) {} + PyPDLPatternModule(PyPDLPatternModule &&other) noexcept + : module(other.module) { + other.module.ptr = nullptr; + } + ~PyPDLPatternModule() { + if (module.ptr != nullptr) + mlirPDLPatternModuleDestroy(module); + } + MlirPDLPatternModule get() { return module; } + +private: + MlirPDLPatternModule module; +}; +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH + +/// Owning Wrapper around a FrozenRewritePatternSet. 
+class PyFrozenRewritePatternSet { +public: + PyFrozenRewritePatternSet(MlirFrozenRewritePatternSet set) : set(set) {} + PyFrozenRewritePatternSet(PyFrozenRewritePatternSet &&other) noexcept + : set(other.set) { + other.set.ptr = nullptr; + } + ~PyFrozenRewritePatternSet() { + if (set.ptr != nullptr) + mlirFrozenRewritePatternSetDestroy(set); + } + MlirFrozenRewritePatternSet get() { return set; } + + pybind11::object getCapsule() { + return py::reinterpret_steal<py::object>( + mlirPythonFrozenRewritePatternSetToCapsule(get())); + } + + static pybind11::object createFromCapsule(pybind11::object capsule) { + MlirFrozenRewritePatternSet rawPm = + mlirPythonCapsuleToFrozenRewritePatternSet(capsule.ptr()); + if (rawPm.ptr == nullptr) + throw py::error_already_set(); + return py::cast(PyFrozenRewritePatternSet(rawPm), + py::return_value_policy::move); + } + +private: + MlirFrozenRewritePatternSet set; +}; + +} // namespace + +/// Create the `mlir.rewrite` submodule here. +void mlir::python::populateRewriteSubmodule(py::module &m) { + //---------------------------------------------------------------------------- + // Mapping of the PDL pattern module and frozen rewrite pattern set. + //---------------------------------------------------------------------------- +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH + py::class_<PyPDLPatternModule>(m, "PDLModule", py::module_local()) + .def(py::init<>([](MlirModule module) { + return mlirPDLPatternModuleFromModule(module); + }), + "module"_a, "Create a PDL module from the given module.") + .def("freeze", [](PyPDLPatternModule &self) { + return new PyFrozenRewritePatternSet(mlirFreezeRewritePattern( + mlirRewritePatternSetFromPDLPatternModule(self.get()))); + }); +#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH + py::class_<PyFrozenRewritePatternSet>(m, "FrozenRewritePatternSet", + py::module_local()) + .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, + &PyFrozenRewritePatternSet::getCapsule) + .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, + &PyFrozenRewritePatternSet::createFromCapsule); + m.def( + "apply_patterns_and_fold_greedily", + [](MlirModule module, MlirFrozenRewritePatternSet set) { + auto status = mlirApplyPatternsAndFoldGreedily(module, set, {}); + if (mlirLogicalResultIsFailure(status)) + // FIXME: Not sure this is the right error to throw here. + throw py::value_error("pattern application failed to converge"); + }, + "module"_a, "set"_a, + "Applies the given patterns to the given module greedily while folding " + "results."); +} diff --git a/mlir/lib/Bindings/Python/Rewrite.h b/mlir/lib/Bindings/Python/Rewrite.h new file mode 100644 index 00000000000000..997b80adda3038 --- /dev/null +++ b/mlir/lib/Bindings/Python/Rewrite.h @@ -0,0 +1,22 @@ +//===- Rewrite.h - Rewrite Submodules of pybind module --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_BINDINGS_PYTHON_REWRITE_H +#define MLIR_BINDINGS_PYTHON_REWRITE_H + +#include "PybindUtils.h" + +namespace mlir { +namespace python { + +void populateRewriteSubmodule(pybind11::module &m); + +} // namespace python +} // namespace mlir + +#endif // MLIR_BINDINGS_PYTHON_REWRITE_H diff --git a/mlir/lib/CAPI/Transforms/CMakeLists.txt b/mlir/lib/CAPI/Transforms/CMakeLists.txt index 2638025a8c359a..6c67aa09fdf402 100644 --- a/mlir/lib/CAPI/Transforms/CMakeLists.txt +++ b/mlir/lib/CAPI/Transforms/CMakeLists.txt @@ -1,6 +1,9 @@ add_mlir_upstream_c_api_library(MLIRCAPITransforms Passes.cpp + Rewrite.cpp LINK_LIBS PUBLIC + MLIRIR MLIRTransforms + MLIRTransformUtils ) diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp new file mode 100644 index 00000000000000..0de1958398f63e --- /dev/null +++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp @@ -0,0 +1,83 @@ +//===- Rewrite.cpp - C API for Rewrite Patterns ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir-c/Rewrite.h" +#include "mlir-c/Transforms.h" +#include "mlir/CAPI/IR.h" +#include "mlir/CAPI/Support.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Rewrite/FrozenRewritePatternSet.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +using namespace mlir; + +inline mlir::RewritePatternSet &unwrap(MlirRewritePatternSet module) { + assert(module.ptr && "unexpected null module"); + return *(static_cast<mlir::RewritePatternSet *>(module.ptr)); +} + +inline MlirRewritePatternSet wrap(mlir::RewritePatternSet *module) { + return {module}; +} + +inline mlir::FrozenRewritePatternSet * +unwrap(MlirFrozenRewritePatternSet module) { + assert(module.ptr && "unexpected null module"); + return static_cast<mlir::FrozenRewritePatternSet *>(module.ptr); +} + +inline MlirFrozenRewritePatternSet wrap(mlir::FrozenRewritePatternSet *module) { + return {module}; +} + +MlirFrozenRewritePatternSet mlirFreezeRewritePattern(MlirRewritePatternSet op) { + auto *m = new mlir::FrozenRewritePatternSet(std::move(unwrap(op))); + op.ptr = nullptr; + return wrap(m); +} + +void mlirFrozenRewritePatternSetDestroy(MlirFrozenRewritePatternSet op) { + delete unwrap(op); + op.ptr = nullptr; +} + +MlirLogicalResult +mlirApplyPatternsAndFoldGreedily(MlirModule op, + MlirFrozenRewritePatternSet patterns, + MlirGreedyRewriteDriverConfig) { + return wrap( + mlir::applyPatternsAndFoldGreedily(unwrap(op), *unwrap(patterns))); +} + +#if MLIR_ENABLE_PDL_IN_PATTERNMATCH +inline mlir::PDLPatternModule *unwrap(MlirPDLPatternModule module) { + assert(module.ptr && "unexpected null module"); + return static_cast<mlir::PDLPatternModule *>(module.ptr); +} + +inline MlirPDLPatternModule wrap(mlir::PDLPatternModule *module) { + return {module}; +} + +MlirPDLPatternModule mlirPDLPatternModuleFromModule(MlirModule op) { + return wrap(new mlir::PDLPatternModule( + mlir::OwningOpRef<mlir::ModuleOp>(unwrap(op)))); +} + +void mlirPDLPatternModuleDestroy(MlirPDLPatternModule op) { + delete unwrap(op); + op.ptr = nullptr; +} + +MlirRewritePatternSet +mlirRewritePatternSetFromPDLPatternModule(MlirPDLPatternModule op) { + auto *m = new mlir::RewritePatternSet(std::move(*unwrap(op))); + op.ptr = nullptr; + return wrap(m); +}
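Read together, the new C entry points compose into a freeze-then-apply flow. A hypothetical end-to-end use from C++ follows; `pdlMod` and `payload` are placeholder handles owned by the caller, error handling is elided, and note that `mlirRewritePatternSetFromPDLPatternModule` and `mlirFreezeRewritePattern` both consume their argument (each nulls the incoming `ptr`):

```cpp
// Hypothetical usage sketch (not part of the patch): run frozen PDL
// patterns over a payload module via the new C API.
#include "mlir-c/IR.h"
#include "mlir-c/Rewrite.h"
#include "mlir-c/Support.h"

void runPdlPatterns(MlirModule pdlMod, MlirModule payload) {
  MlirPDLPatternModule pdl = mlirPDLPatternModuleFromModule(pdlMod);
  MlirRewritePatternSet set = mlirRewritePatternSetFromPDLPatternModule(pdl);
  MlirFrozenRewritePatternSet frozen = mlirFreezeRewritePattern(set);
  MlirGreedyRewriteDriverConfig config = {nullptr}; // driver defaults
  if (mlirLogicalResultIsFailure(
          mlirApplyPatternsAndFoldGreedily(payload, frozen, config))) {
    // The greedy driver did not converge; the payload may be partially
    // rewritten.
  }
  mlirFrozenRewritePatternSetDestroy(frozen);
}
```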
+#endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SparseTensor/Transforms/CMakeLists.txt index af3a1b48f45af9..2a29ee8a7a87cb 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CMakeLists.txt @@ -6,6 +6,7 @@ add_mlir_dialect_library(MLIRSparseTensorTransforms SparseGPUCodegen.cpp SparseReinterpretMap.cpp SparseStorageSpecifierToLLVM.cpp + SparseSpaceCollapse.cpp SparseTensorCodegen.cpp SparseTensorConversion.cpp SparseTensorPasses.cpp diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp new file mode 100644 index 00000000000000..924046fcd9961f --- /dev/null +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp @@ -0,0 +1,199 @@ +//===--------- SparseSpaceCollapse.cpp - Collapse Sparse Space Pass -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/Transforms/Passes.h" + +#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" +#include "mlir/Dialect/SparseTensor/Transforms/Passes.h" + +namespace mlir { +#define GEN_PASS_DEF_SPARSESPACECOLLAPSE +#include "mlir/Dialect/SparseTensor/Transforms/Passes.h.inc" +} // namespace mlir + +#define DEBUG_TYPE "sparse-space-collapse" + +using namespace mlir; +using namespace sparse_tensor; + +namespace { + +struct CollapseSpaceInfo { + ExtractIterSpaceOp space; + IterateOp loop; +}; + +bool isCollapsableLoops(LoopLikeOpInterface parent, LoopLikeOpInterface node) { + auto pIterArgs = parent.getRegionIterArgs(); + auto nInitArgs = node.getInits(); + if (pIterArgs.size() != nInitArgs.size()) + return false; + + // Two loops are collapsable if they are perfectly nested. + auto pYields = parent.getYieldedValues(); + auto nResult = node.getLoopResults().value(); + + bool yieldEq = + llvm::all_of(llvm::zip_equal(pYields, nResult), [](auto zipped) { + return std::get<0>(zipped) == std::get<1>(zipped); + }); + + // Parent iter_args should be passed directly to the node's init_args. + bool iterArgEq = + llvm::all_of(llvm::zip_equal(pIterArgs, nInitArgs), [](auto zipped) { + return std::get<0>(zipped) == std::get<1>(zipped); + }); + + return yieldEq && iterArgEq; +} + +bool legalToCollapse(SmallVectorImpl<CollapseSpaceInfo> &toCollapse, + ExtractIterSpaceOp curSpace) { + + auto getIterateOpOverSpace = [](ExtractIterSpaceOp space) -> IterateOp { + Value spaceVal = space.getExtractedSpace(); + if (spaceVal.hasOneUse()) + return llvm::dyn_cast<IterateOp>(*spaceVal.getUsers().begin()); + return nullptr; + }; + + if (toCollapse.empty()) { + // Collapse root. + if (auto itOp = getIterateOpOverSpace(curSpace)) { + CollapseSpaceInfo &info = toCollapse.emplace_back(); + info.space = curSpace; + info.loop = itOp; + return true; + } + return false; + } + + auto parent = toCollapse.back().space; + auto pItOp = toCollapse.back().loop; + auto nItOp = getIterateOpOverSpace(curSpace); + + // Can only collapse spaces extracted from the same tensor.
+ if (parent.getTensor() != curSpace.getTensor()) { + LLVM_DEBUG({ + llvm::dbgs() + << "failed to collapse spaces extracted from different tensors."; + }); + return false; + } + + // Can only collapse consecutive simple iterations over one tensor (i.e., no + // coiteration). + if (!nItOp || nItOp->getBlock() != curSpace->getBlock() || + pItOp.getIterator() != curSpace.getParentIter() || + curSpace->getParentOp() != pItOp.getOperation()) { + LLVM_DEBUG( + { llvm::dbgs() << "failed to collapse non-consecutive IterateOps."; }); + return false; + } + + if (pItOp && !isCollapsableLoops(pItOp, nItOp)) { + LLVM_DEBUG({ + llvm::dbgs() + << "failed to collapse IterateOps that are not perfectly nested."; + }); + return false; + } + + CollapseSpaceInfo &info = toCollapse.emplace_back(); + info.space = curSpace; + info.loop = nItOp; + return true; +} + +void collapseSparseSpace(MutableArrayRef<CollapseSpaceInfo> toCollapse) { + if (toCollapse.size() < 2) + return; + + ExtractIterSpaceOp root = toCollapse.front().space; + ExtractIterSpaceOp leaf = toCollapse.back().space; + Location loc = root.getLoc(); + + assert(root->hasOneUse() && leaf->hasOneUse()); + + // Insert collapsed operation at the same scope as root operation. + OpBuilder builder(root); + + // Construct the collapsed iteration space. + auto collapsedSpace = builder.create<ExtractIterSpaceOp>( + loc, root.getTensor(), root.getParentIter(), root.getLoLvl(), + leaf.getHiLvl()); + + auto rItOp = llvm::cast<IterateOp>(*root->getUsers().begin()); + auto innermost = toCollapse.back().loop; + + IRMapping mapper; + mapper.map(leaf, collapsedSpace.getExtractedSpace()); + for (auto z : llvm::zip_equal(innermost.getInitArgs(), rItOp.getInitArgs())) + mapper.map(std::get<0>(z), std::get<1>(z)); + + auto cloned = llvm::cast<IterateOp>(builder.clone(*innermost, mapper)); + builder.setInsertionPointToStart(cloned.getBody()); + + LevelSet crdUsedLvls; + unsigned shift = 0, argIdx = 1; + for (auto info : toCollapse.drop_back()) { + LevelSet set = info.loop.getCrdUsedLvls(); + crdUsedLvls |= set.lshift(shift); + shift += info.loop.getSpaceDim(); + for (BlockArgument crd : info.loop.getCrds()) { + BlockArgument collapsedCrd = cloned.getBody()->insertArgument( + argIdx++, builder.getIndexType(), crd.getLoc()); + crd.replaceAllUsesWith(collapsedCrd); + } + } + crdUsedLvls |= innermost.getCrdUsedLvls().lshift(shift); + cloned.getIterator().setType(collapsedSpace.getType().getIteratorType()); + cloned.setCrdUsedLvls(crdUsedLvls); + + rItOp.replaceAllUsesWith(cloned.getResults()); + // Erase collapsed loops. + rItOp.erase(); + root.erase(); +} + +struct SparseSpaceCollapsePass + : public impl::SparseSpaceCollapseBase<SparseSpaceCollapsePass> { + SparseSpaceCollapsePass() = default; + + void runOnOperation() override { + func::FuncOp func = getOperation(); + + // A naive (experimental) implementation to collapse consecutive sparse + // spaces. It does NOT handle complex cases where multiple spaces are + // extracted in the same basic block. E.g., + // + // %space1 = extract_space %t1 ... + // %space2 = extract_space %t2 ... + // sparse_tensor.iterate(%sp1) ... + // + SmallVector<CollapseSpaceInfo> toCollapse; + func->walk([&](ExtractIterSpaceOp op) { + if (!legalToCollapse(toCollapse, op)) { + // If it is not legal to collapse one more space, collapse the + // existing ones and clear.
+ collapseSparseSpace(toCollapse); + toCollapse.clear(); + } + }); + + collapseSparseSpace(toCollapse); + } +}; + +} // namespace + +std::unique_ptr<Pass> mlir::createSparseSpaceCollapsePass() { + return std::make_unique<SparseSpaceCollapsePass>(); +} diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp index 3a8972072ac3b1..13c750e83d0454 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp @@ -61,20 +61,34 @@ class SparsificationAndBufferizationPass : public impl::SparsificationAndBufferizationBase< SparsificationAndBufferizationPass> { public: + // Private pass options only. SparsificationAndBufferizationPass( const bufferization::OneShotBufferizationOptions &bufferizationOptions, const SparsificationOptions &sparsificationOptions, bool createSparseDeallocs, bool enableRuntimeLibrary, - bool enableBufferInitialization, unsigned vectorLength, - bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen) + bool enableBufferInitialization) : bufferizationOptions(bufferizationOptions), sparsificationOptions(sparsificationOptions), createSparseDeallocs(createSparseDeallocs), enableRuntimeLibrary(enableRuntimeLibrary), - enableBufferInitialization(enableBufferInitialization), - vectorLength(vectorLength), - enableVLAVectorization(enableVLAVectorization), - enableSIMDIndex32(enableSIMDIndex32), enableGPULibgen(enableGPULibgen) { + enableBufferInitialization(enableBufferInitialization) {} + // Private pass options and visible pass options. + SparsificationAndBufferizationPass( + const bufferization::OneShotBufferizationOptions &bufferizationOptions, + const SparsificationOptions &sparsificationOptions, + bool createSparseDeallocs, bool enableRuntimeLibrary, + bool enableBufferInitialization, unsigned vl, bool vla, bool index32, + bool gpu) + : bufferizationOptions(bufferizationOptions), + sparsificationOptions(sparsificationOptions), + createSparseDeallocs(createSparseDeallocs), + enableRuntimeLibrary(enableRuntimeLibrary), + enableBufferInitialization(enableBufferInitialization) { + // Set the visible pass options explicitly. + vectorLength = vl; + enableVLAVectorization = vla; + enableSIMDIndex32 = index32; + enableGPULibgen = gpu; } /// Bufferize all dense ops.
@@ -178,10 +192,6 @@ class SparsificationAndBufferizationPass
   bool createSparseDeallocs;
   bool enableRuntimeLibrary;
   bool enableBufferInitialization;
-  unsigned vectorLength;
-  bool enableVLAVectorization;
-  bool enableSIMDIndex32;
-  bool enableGPULibgen;
 };
 
 } // namespace sparse_tensor
@@ -213,16 +223,13 @@ mlir::getBufferizationOptionsForSparsification(bool analysisOnly) {
 
 std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass() {
   SparsificationOptions sparseOptions;
-  return createSparsificationAndBufferizationPass(
+  return std::make_unique<
+      mlir::sparse_tensor::SparsificationAndBufferizationPass>(
       getBufferizationOptionsForSparsification(/*analysisOnly=*/false),
       sparseOptions,
       /*createSparseDeallocs=*/false,
       /*enableRuntimeLibrary=*/false,
-      /*enableBufferInitialization=*/false,
-      /*vectorLength=*/0,
-      /*enableVLAVectorization=*/false,
-      /*enableSIMDIndex32=*/false,
-      /*enableGPULibgen=*/false);
+      /*enableBufferInitialization=*/false);
 }
 
 std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass(
diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt
index d8f2d1989fdea7..d03036e17749d4 100644
--- a/mlir/python/CMakeLists.txt
+++ b/mlir/python/CMakeLists.txt
@@ -21,6 +21,7 @@ declare_mlir_python_sources(MLIRPythonSources.Core.Python
     _mlir_libs/__init__.py
     ir.py
     passmanager.py
+    rewrite.py
     dialects/_ods_common.py
 
     # The main _mlir module has submodules: include stubs from each.
@@ -448,6 +449,7 @@ declare_mlir_python_extension(MLIRPythonExtension.Core
     IRModule.cpp
     IRTypes.cpp
     Pass.cpp
+    Rewrite.cpp
 
     # Headers must be included explicitly so they are installed.
     Globals.h
diff --git a/mlir/python/mlir/dialects/pdl.py b/mlir/python/mlir/dialects/pdl.py
index db07dc50aabd79..b7b8430cebd07a 100644
--- a/mlir/python/mlir/dialects/pdl.py
+++ b/mlir/python/mlir/dialects/pdl.py
@@ -6,7 +6,7 @@
 from ._pdl_ops_gen import _Dialect
 from .._mlir_libs._mlirDialectsPDL import *
 from .._mlir_libs._mlirDialectsPDL import OperationType
-
+from ..extras.meta import region_op
 
 try:
     from ..ir import *
@@ -127,6 +127,9 @@ def body(self):
         return self.regions[0].blocks[0]
 
 
+pattern = region_op(PatternOp.__base__)
+
+
 @_ods_cext.register_operation(_Dialect, replace=True)
 class ReplaceOp(ReplaceOp):
     """Specialization for PDL replace op class."""
@@ -195,6 +198,9 @@ def body(self):
         return self.regions[0].blocks[0]
 
 
+rewrite = region_op(RewriteOp)
+
+
 @_ods_cext.register_operation(_Dialect, replace=True)
 class TypeOp(TypeOp):
     """Specialization for PDL type op class."""
diff --git a/mlir/python/mlir/rewrite.py b/mlir/python/mlir/rewrite.py
new file mode 100644
index 00000000000000..5bc1bba7ae9a72
--- /dev/null
+++ b/mlir/python/mlir/rewrite.py
@@ -0,0 +1,5 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from ._mlir_libs._mlir.rewrite import *
diff --git a/mlir/test/Dialect/SparseTensor/minipipeline_vector.mlir b/mlir/test/Dialect/SparseTensor/minipipeline_vector.mlir
new file mode 100755
index 00000000000000..2475aa5139da48
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/minipipeline_vector.mlir
@@ -0,0 +1,43 @@
+// RUN: mlir-opt %s --sparsification-and-bufferization | FileCheck %s --check-prefix=CHECK-NOVEC
+// RUN: mlir-opt %s --sparsification-and-bufferization="vl=8" | FileCheck %s --check-prefix=CHECK-VEC
+
+// Test to ensure we can pass optimization flags into
+// the mini sparsification and bufferization pipeline.
+
+#SV = #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>
+
+#trait_sum_reduction = {
+  indexing_maps = [
+    affine_map<(i) -> (i)>,  // a
+    affine_map<(i) -> ()>    // x (scalar out)
+  ],
+  iterator_types = ["reduction"],
+  doc = "x += SUM_i a(i)"
+}
+
+//
+// CHECK-NOVEC-LABEL: func.func @sum_reduction
+// CHECK-NOVEC:       scf.for
+// CHECK-NOVEC:         arith.addf %{{.*}} %{{.*}} : f32
+// CHECK-NOVEC:       }
+//
+// CHECK-VEC-LABEL: func.func @sum_reduction
+// CHECK-VEC:       vector.insertelement
+// CHECK-VEC:       scf.for
+// CHECK-VEC:         vector.create_mask
+// CHECK-VEC:         vector.maskedload
+// CHECK-VEC:         arith.addf %{{.*}} %{{.*}} : vector<8xf32>
+// CHECK-VEC:       }
+// CHECK-VEC:       vector.reduction
+//
+func.func @sum_reduction(%arga: tensor<?xf32, #SV>,
+                         %argx: tensor<f32>) -> tensor<f32> {
+  %0 = linalg.generic #trait_sum_reduction
+    ins(%arga: tensor<?xf32, #SV>)
+    outs(%argx: tensor<f32>) {
+      ^bb(%a: f32, %x: f32):
+        %0 = arith.addf %x, %a : f32
+        linalg.yield %0 : f32
+  } -> tensor<f32>
+  return %0 : tensor<f32>
+}
diff --git a/mlir/test/Dialect/SparseTensor/sparse_space_collapse.mlir b/mlir/test/Dialect/SparseTensor/sparse_space_collapse.mlir
new file mode 100644
index 00000000000000..baa6199f12bc38
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_space_collapse.mlir
@@ -0,0 +1,37 @@
+// RUN: mlir-opt %s --sparse-space-collapse | FileCheck %s
+
+#COO = #sparse_tensor.encoding<{
+  map = (i, j) -> (
+    i : compressed(nonunique),
+    j : singleton(soa)
+  )
+}>
+
+// CHECK-LABEL:   func.func @sparse_sparse_collapse(
+// CHECK-SAME:      %[[VAL_0:.*]]: tensor<4x8xf32, #sparse>,
+// CHECK-SAME:      %[[VAL_1:.*]]: index) {
+// CHECK:           %[[VAL_3:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] lvls = 0 to 2 : tensor<4x8xf32, #sparse>
+// CHECK:           %[[VAL_4:.*]] = sparse_tensor.iterate %[[VAL_5:.*]] in %[[VAL_3]] at(%[[VAL_6:.*]], _) iter_args(%[[VAL_7:.*]] = %[[VAL_1]])
+// CHECK:             %[[VAL_8:.*]] = "test.op"(%[[VAL_7]]) : (index) -> index
+// CHECK:             sparse_tensor.yield %[[VAL_8]] : index
+// CHECK:           }
+// CHECK:           "test.sink"(%[[VAL_4]]) : (index) -> ()
+// CHECK:           return
+// CHECK:         }
func.func @sparse_sparse_collapse(%sp : tensor<4x8xf32, #COO>, %i : index) {
+  %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0
+      : tensor<4x8xf32, #COO>
+     -> !sparse_tensor.iter_space<#COO, lvls = 0>
+  %r1 = sparse_tensor.iterate %it1 in %l1 at(%crd0) iter_args(%outer = %i): !sparse_tensor.iter_space<#COO, lvls = 0 to 1> -> index {
+    %l2 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1
+        : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0 to 1>
+       -> !sparse_tensor.iter_space<#COO, lvls = 1>
+    %r2 = sparse_tensor.iterate %it2 in %l2 iter_args(%inner = %outer): !sparse_tensor.iter_space<#COO, lvls = 1 to 2> -> index {
+      %k = "test.op"(%inner) : (index) -> index
+      sparse_tensor.yield %k : index
+    }
+    sparse_tensor.yield %r2 : index
+  }
+  "test.sink"(%r1) : (index) -> ()
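+  // The two nested extract/iterate pairs above are expected to fold into the
+  // single `lvls = 0 to 2` iteration verified by the CHECK lines.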
+  return
+}
diff --git a/mlir/test/python/integration/dialects/pdl.py b/mlir/test/python/integration/dialects/pdl.py
new file mode 100644
index 00000000000000..923af29a71ad75
--- /dev/null
+++ b/mlir/test/python/integration/dialects/pdl.py
@@ -0,0 +1,67 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+from mlir.dialects import arith, func, pdl
+from mlir.dialects.builtin import module
+from mlir.ir import *
+from mlir.rewrite import *
+
+
+def construct_and_print_in_module(f):
+    print("\nTEST:", f.__name__)
+    with Context(), Location.unknown():
+        module = Module.create()
+        with InsertionPoint(module.body):
+            module = f(module)
+        if module is not None:
+            print(module)
+    return f
+
+
+# CHECK-LABEL: TEST: test_add_to_mul
+# CHECK: arith.muli
+@construct_and_print_in_module
+def test_add_to_mul(module_):
+    index_type = IndexType.get()
+
+    # Create a test case.
+    @module(sym_name="ir")
+    def ir():
+        @func.func(index_type, index_type)
+        def add_func(a, b):
+            return arith.addi(a, b)
+
+    # Create a rewrite from add to mul. This will match:
+    # - the operation name is arith.addi,
+    # - the operands are index types,
+    # - there are two operands.
+    with Location.unknown():
+        m = Module.create()
+        with InsertionPoint(m.body):
+            # Change all arith.addi with index types to arith.muli.
+            @pdl.pattern(benefit=1, sym_name="addi_to_mul")
+            def pat():
+                # Match arith.addi with index types.
+                index_type = pdl.TypeOp(IndexType.get())
+                operand0 = pdl.OperandOp(index_type)
+                operand1 = pdl.OperandOp(index_type)
+                op0 = pdl.OperationOp(
+                    name="arith.addi", args=[operand0, operand1], types=[index_type]
+                )
+
+                # Replace the matched op with arith.muli.
+                @pdl.rewrite()
+                def rew():
+                    newOp = pdl.OperationOp(
+                        name="arith.muli", args=[operand0, operand1], types=[index_type]
+                    )
+                    pdl.ReplaceOp(op0, with_op=newOp)
+
+    # Create a PDL module from the module and freeze it. At this point the
+    # ownership of the module is transferred to the PDL module. This ownership
+    # transfer is not yet captured on the Python side and has sharp edges, so
+    # it is best to construct the module and the PDL module in the same scope.
+    # FIXME: This should be made more robust.
+    frozen = PDLModule(m).freeze()
+    # The frozen pattern set could be applied multiple times.
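+    # apply_patterns_and_fold_greedily drives MLIR's greedy rewrite driver:
+    # it repeatedly matches the patterns in the frozen set against `module_`
+    # and applies them until no more rewrites fire.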
+ apply_patterns_and_fold_greedily(module_, frozen) + return module_ diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 3dfd3fd91add77..a2d3f400600d0d 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -420,6 +420,7 @@ mlir_c_api_cc_library( "include/mlir-c/Interfaces.h", "include/mlir-c/Pass.h", "include/mlir-c/RegisterEverything.h", + "include/mlir-c/Rewrite.h", "include/mlir-c/Support.h", "include/mlir/CAPI/AffineExpr.h", "include/mlir/CAPI/AffineMap.h", @@ -866,7 +867,10 @@ mlir_c_api_cc_library( mlir_c_api_cc_library( name = "CAPITransforms", - srcs = ["lib/CAPI/Transforms/Passes.cpp"], + srcs = [ + "lib/CAPI/Transforms/Passes.cpp", + "lib/CAPI/Transforms/Rewrite.cpp", + ], hdrs = ["include/mlir-c/Transforms.h"], capi_deps = [ ":CAPIIR", @@ -876,7 +880,10 @@ mlir_c_api_cc_library( ], includes = ["include"], deps = [ + ":IR", ":Pass", + ":Rewrite", + ":TransformUtils", ":Transforms", ], ) @@ -939,6 +946,7 @@ cc_library( textual_hdrs = glob(MLIR_BINDINGS_PYTHON_HEADERS), deps = [ ":CAPIIRHeaders", + ":CAPITransformsHeaders", "@local_config_python//:python_headers", "@pybind11", ], @@ -957,6 +965,7 @@ cc_library( textual_hdrs = glob(MLIR_BINDINGS_PYTHON_HEADERS), deps = [ ":CAPIIR", + ":CAPITransforms", "@local_config_python//:python_headers", "@pybind11", ], @@ -981,6 +990,7 @@ MLIR_PYTHON_BINDINGS_SOURCES = [ "lib/Bindings/Python/IRModule.cpp", "lib/Bindings/Python/IRTypes.cpp", "lib/Bindings/Python/Pass.cpp", + "lib/Bindings/Python/Rewrite.cpp", ] cc_library( diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel index add150de69faf4..254cab0db4a5d6 100644 --- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel @@ -82,6 +82,13 @@ filegroup( ], ) +filegroup( + name = "RewritePyFiles", + srcs = [ + "mlir/rewrite.py", + ], +) + filegroup( name = "RuntimePyFiles", srcs = glob([