diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp
index 2907e3ba3c303c..5790273d625ef1 100644
--- a/clang-tools-extra/clangd/ClangdServer.cpp
+++ b/clang-tools-extra/clangd/ClangdServer.cpp
@@ -523,7 +523,7 @@ void ClangdServer::formatFile(PathRef File, std::optional<Range> Rng,
   auto Action = [File = File.str(), Code = std::move(*Code),
                  Ranges = std::vector<tooling::Range>{RequestedRange},
                  CB = std::move(CB), this]() mutable {
-    format::FormatStyle Style = getFormatStyleForFile(File, Code, TFS);
+    format::FormatStyle Style = getFormatStyleForFile(File, Code, TFS, true);
     tooling::Replacements IncludeReplaces =
         format::sortIncludes(Style, Code, Ranges, File);
     auto Changed = tooling::applyAllReplacements(Code, IncludeReplaces);
@@ -551,7 +551,7 @@ void ClangdServer::formatOnType(PathRef File, Position Pos,
   auto Action = [File = File.str(), Code = std::move(*Code),
                  TriggerText = TriggerText.str(), CursorPos = *CursorPos,
                  CB = std::move(CB), this]() mutable {
-    auto Style = getFormatStyleForFile(File, Code, TFS);
+    auto Style = getFormatStyleForFile(File, Code, TFS, false);
     std::vector<TextEdit> Result;
     for (const tooling::Replacement &R :
          formatIncremental(Code, CursorPos, TriggerText, Style))
@@ -605,7 +605,7 @@ void ClangdServer::rename(PathRef File, Position Pos, llvm::StringRef NewName,
 
     if (Opts.WantFormat) {
       auto Style = getFormatStyleForFile(File, InpAST->Inputs.Contents,
-                                         *InpAST->Inputs.TFS);
+                                         *InpAST->Inputs.TFS, false);
       llvm::Error Err = llvm::Error::success();
       for (auto &E : R->GlobalChanges)
         Err =
@@ -762,7 +762,7 @@ void ClangdServer::applyTweak(PathRef File, Range Sel, StringRef TweakID,
       for (auto &It : (*Effect)->ApplyEdits) {
         Edit &E = It.second;
         format::FormatStyle Style =
-            getFormatStyleForFile(File, E.InitialCode, TFS);
+            getFormatStyleForFile(File, E.InitialCode, TFS, false);
         if (llvm::Error Err = reformatEdit(E, Style))
           elog("Failed to format {0}: {1}", It.first(), std::move(Err));
       }
@@ -825,7 +825,7 @@ void ClangdServer::findHover(PathRef File, Position Pos,
     if (!InpAST)
       return CB(InpAST.takeError());
     format::FormatStyle Style = getFormatStyleForFile(
-        File, InpAST->Inputs.Contents, *InpAST->Inputs.TFS);
+        File, InpAST->Inputs.Contents, *InpAST->Inputs.TFS, false);
     CB(clangd::getHover(InpAST->AST, Pos, std::move(Style), Index));
   };
 
diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp
index 0e5f08cec440ce..036eb9808ea082 100644
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@@ -1628,7 +1628,7 @@ class CodeCompleteFlow {
       IsUsingDeclaration = Recorder->CCContext.isUsingDeclaration();
       auto Style = getFormatStyleForFile(SemaCCInput.FileName,
                                          SemaCCInput.ParseInput.Contents,
-                                         *SemaCCInput.ParseInput.TFS);
+                                         *SemaCCInput.ParseInput.TFS, false);
       const auto NextToken = findTokenAfterCompletionPoint(
           Recorder->CCSema->getPreprocessor().getCodeCompletionLoc(),
           Recorder->CCSema->getSourceManager(), Recorder->CCSema->LangOpts);
@@ -1719,7 +1719,7 @@ class CodeCompleteFlow {
     ProxSources[FileName].Cost = 0;
     FileProximity.emplace(ProxSources);
 
-    auto Style = getFormatStyleForFile(FileName, Content, TFS);
+    auto Style = getFormatStyleForFile(FileName, Content, TFS, false);
     // This will only insert verbatim headers.
     Inserter.emplace(FileName, Content, Style,
                      /*BuildDir=*/"", /*HeaderSearchInfo=*/nullptr);
diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp
index 7375b7b0860917..8e48f546d94e77 100644
--- a/clang-tools-extra/clangd/IncludeCleaner.cpp
+++ b/clang-tools-extra/clangd/IncludeCleaner.cpp
@@ -116,7 +116,7 @@ std::vector<Diag> generateMissingIncludeDiagnostics(
   const SourceManager &SM = AST.getSourceManager();
   const FileEntry *MainFile = SM.getFileEntryForID(SM.getMainFileID());
 
-  auto FileStyle = getFormatStyleForFile(AST.tuPath(), Code, TFS);
+  auto FileStyle = getFormatStyleForFile(AST.tuPath(), Code, TFS, false);
 
   tooling::HeaderIncludes HeaderIncludes(AST.tuPath(), Code,
                                          FileStyle.IncludeStyle);
diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp
index 862f06196a7100..3ff759415f7c8b 100644
--- a/clang-tools-extra/clangd/ParsedAST.cpp
+++ b/clang-tools-extra/clangd/ParsedAST.cpp
@@ -626,7 +626,7 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs,
     // (e.g. incomplete type) and attach include insertion fixes to diagnostics.
     if (Inputs.Index && !BuildDir.getError()) {
       auto Style =
-          getFormatStyleForFile(Filename, Inputs.Contents, *Inputs.TFS);
+          getFormatStyleForFile(Filename, Inputs.Contents, *Inputs.TFS, false);
       auto Inserter = std::make_shared<IncludeInserter>(
           Filename, Inputs.Contents, Style, BuildDir.get(),
           &Clang->getPreprocessor().getHeaderSearchInfo());
diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp
index 8aa18bb0058abe..c6553e00dcae28 100644
--- a/clang-tools-extra/clangd/Protocol.cpp
+++ b/clang-tools-extra/clangd/Protocol.cpp
@@ -1412,7 +1412,7 @@ bool fromJSON(const llvm::json::Value &Params, ReferenceParams &R,
 }
 
 llvm::json::Value toJSON(SymbolTag Tag) {
-  return llvm::json::Value{static_cast<int>(Tag)};
+  return llvm::json::Value(static_cast<int>(Tag));
 }
 
 llvm::json::Value toJSON(const CallHierarchyItem &I) {
diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp
index 3e741f6e0b536b..3af99b9db056da 100644
--- a/clang-tools-extra/clangd/SourceCode.cpp
+++ b/clang-tools-extra/clangd/SourceCode.cpp
@@ -582,7 +582,21 @@ std::optional<FileDigest> digestFile(const SourceManager &SM, FileID FID) {
 
 format::FormatStyle getFormatStyleForFile(llvm::StringRef File,
                                           llvm::StringRef Content,
-                                          const ThreadsafeFS &TFS) {
+                                          const ThreadsafeFS &TFS,
+                                          bool FormatFile) {
+  // Unless we're formatting a substantial amount of code (the entire file
+  // or an arbitrarily large range), skip libFormat's heuristic check for
+  // .h files that tries to determine whether the file contains objective-c
+  // code. (This is accomplished by passing empty code contents to getStyle().
+  // The heuristic is the only thing that looks at the contents.)
+  // This is a workaround for PR60151, a known issue in libFormat where this
+  // heuristic can OOM on large files. If we *are* formatting the entire file,
+  // there's no point in doing this because the actual format::reformat() call
+  // will run into the same OOM; we'd just be risking inconsistencies between
+  // clangd and clang-format on smaller .h files where they disagree on what
+  // language is detected.
+  if (!FormatFile)
+    Content = {};
   auto Style = format::getStyle(format::DefaultFormatStyle, File,
                                 format::DefaultFallbackStyle, Content,
                                 TFS.view(/*CWD=*/std::nullopt).get());
diff --git a/clang-tools-extra/clangd/SourceCode.h b/clang-tools-extra/clangd/SourceCode.h
index a1bb44c1761202..028549f659d60a 100644
--- a/clang-tools-extra/clangd/SourceCode.h
+++ b/clang-tools-extra/clangd/SourceCode.h
@@ -171,9 +171,13 @@ std::optional<std::string> getCanonicalPath(const FileEntryRef F,
 /// FIXME: should we be caching the .clang-format file search?
 /// This uses format::DefaultFormatStyle and format::DefaultFallbackStyle,
 /// though the latter may have been overridden in main()!
+/// \p FormatFile indicates whether the returned FormatStyle is used
+/// to format the entire main file (or a range selected by the user
+/// which can be arbitrarily long).
 format::FormatStyle getFormatStyleForFile(llvm::StringRef File,
                                           llvm::StringRef Content,
-                                          const ThreadsafeFS &TFS);
+                                          const ThreadsafeFS &TFS,
+                                          bool FormatFile);
 
 /// Cleanup and format the given replacements.
 llvm::Expected<tooling::Replacements>
diff --git a/clang-tools-extra/clangd/tool/Check.cpp b/clang-tools-extra/clangd/tool/Check.cpp
index b5c4d145619df3..45e2e1e278deaf 100644
--- a/clang-tools-extra/clangd/tool/Check.cpp
+++ b/clang-tools-extra/clangd/tool/Check.cpp
@@ -226,7 +226,7 @@ class Checker {
 
     // FIXME: Check that resource-dir/built-in-headers exist?
 
-    Style = getFormatStyleForFile(File, Inputs.Contents, TFS);
+    Style = getFormatStyleForFile(File, Inputs.Contents, TFS, false);
 
     return true;
   }
diff --git a/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp b/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
index 1be5b7f6a8dbba..801d535c1b9d0d 100644
--- a/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
@@ -1090,6 +1090,44 @@ TEST(ApplyEditsTest, EndLineOutOfRange) {
                     FailedWithMessage("Line value is out of range (100)"));
 }
 
+TEST(FormatStyleForFile, LanguageGuessingHeuristic) {
+  StringRef ObjCContent = "@interface Foo\n@end\n";
+  StringRef CppContent = "class Foo {};\n";
+  using LK = format::FormatStyle::LanguageKind;
+  struct TestCase {
+    llvm::StringRef Filename;
+    llvm::StringRef Contents;
+    bool FormatFile;
+    LK ExpectedLanguage;
+  } TestCases[] = {
+      // If the file extension identifies the file as ObjC, the guessed
+      // language should be ObjC regardless of content or FormatFile flag.
+      {"foo.mm", ObjCContent, true, LK::LK_ObjC},
+      {"foo.mm", ObjCContent, false, LK::LK_ObjC},
+      {"foo.mm", CppContent, true, LK::LK_ObjC},
+      {"foo.mm", CppContent, false, LK::LK_ObjC},
+
+      // If the file extension is ambiguous like .h, FormatFile=true should
+      // result in using libFormat's heuristic to guess the language based
+      // on the file contents.
+      {"foo.h", ObjCContent, true, LK::LK_ObjC},
+      {"foo.h", CppContent, true, LK::LK_Cpp},
+
+      // With FormatFile=false, the language guessing heuristic should be
+      // bypassed, so an ambiguous .h file is reported as C++ either way.
+      {"foo.h", ObjCContent, false, LK::LK_Cpp},
+      {"foo.h", CppContent, false, LK::LK_Cpp},
+  };
+
+  MockFS FS;
+  for (const auto &[Filename, Contents, FormatFile, ExpectedLanguage] :
+       TestCases) {
+    EXPECT_EQ(
+        getFormatStyleForFile(Filename, Contents, FS, FormatFile).Language,
+        ExpectedLanguage);
+  }
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3b89d5a8720785..bce27dc8c4a996 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -251,6 +251,9 @@ Bug Fixes in This Version
   for logical operators in C23.
   Fixes (#GH64356).
 
+- ``__is_trivially_relocatable`` no longer returns ``false`` for volatile-qualified types.
+  Fixes (#GH77091).
+
 - Clang no longer produces a false-positive `-Wunused-variable` warning
   for variables created through copy initialization having side-effects in C++17 and later.
   Fixes (#GH64356) (#GH79518).
diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index 76810a86a78a46..47ed6d0d1db0df 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -673,6 +673,16 @@ class alignas(8) Decl {
   /// fragment. See [module.global.frag]p3,4 for details.
   bool isDiscardedInGlobalModuleFragment() const { return false; }
 
+  /// Check if we should skip checking the ODRHash for this declaration, i.e.
+  /// its owning module is an explicit global module fragment (GMF) and the
+  /// SkipODRCheckInGMF language option is enabled.
+  ///
+  /// The existing ODRHash mechanism is not stable enough yet: false-positive
+  /// ODR violation reports are annoying and true ones are rare. MSVC also
+  /// disabled ODR checks for declarations in the GMF, so we do the same to
+  /// improve the user experience until the checks become stable enough.
+  bool shouldSkipCheckingODR() const;
+
   /// Return true if this declaration has an attribute which acts as
   /// definition of the entity, such as 'alias' or 'ifunc'.
   bool hasDefiningAttr() const;
diff --git a/clang/include/clang/Interpreter/Interpreter.h b/clang/include/clang/Interpreter/Interpreter.h
index c8f932e95c4798..469ce1fd75bf84 100644
--- a/clang/include/clang/Interpreter/Interpreter.h
+++ b/clang/include/clang/Interpreter/Interpreter.h
@@ -18,6 +18,7 @@
 #include "clang/AST/GlobalDecl.h"
 #include "clang/Interpreter/PartialTranslationUnit.h"
 #include "clang/Interpreter/Value.h"
+#include "clang/Sema/Ownership.h"
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
@@ -75,17 +76,26 @@ class IncrementalCompilerBuilder {
   llvm::StringRef CudaSDKPath;
 };
 
+/// Generate glue code between the Interpreter's built-in runtime and user code.
+class RuntimeInterfaceBuilder {
+public:
+  virtual ~RuntimeInterfaceBuilder() = default;
+
+  using TransformExprFunction = ExprResult(RuntimeInterfaceBuilder *Builder,
+                                           Expr *, ArrayRef<Expr *>);
+  virtual TransformExprFunction *getPrintValueTransformer() = 0;
+};
+
 /// Provides top-level interfaces for incremental compilation and execution.
 class Interpreter {
   std::unique_ptr<llvm::orc::ThreadSafeContext> TSCtx;
   std::unique_ptr<IncrementalParser> IncrParser;
   std::unique_ptr<IncrementalExecutor> IncrExecutor;
+  std::unique_ptr<RuntimeInterfaceBuilder> RuntimeIB;
 
   // An optional parser for CUDA offloading
   std::unique_ptr<IncrementalParser> DeviceParser;
 
-  Interpreter(std::unique_ptr<CompilerInstance> CI, llvm::Error &Err);
-
   llvm::Error CreateExecutor();
   unsigned InitPTUSize = 0;
 
@@ -94,8 +104,25 @@ class Interpreter {
   // printing happens, it's in an invalid state.
   Value LastValue;
 
+  // Add a call to an Expr to report its result. We query the function from
+  // RuntimeInterfaceBuilder once and store it as a function pointer to avoid
+  // frequent virtual function calls.
+  RuntimeInterfaceBuilder::TransformExprFunction *AddPrintValueCall = nullptr;
+
+protected:
+  // Derived classes can make use of an extended interface of the Interpreter.
+  // That's useful for testing and out-of-tree clients.
+  Interpreter(std::unique_ptr<CompilerInstance> CI, llvm::Error &Err);
+
+  // Lazily construct the RuntimeInterfaceBuilder. The provided instance will be
+  // used for the entire lifetime of the interpreter. The default implementation
+  // targets the in-process __clang_Interpreter runtime. Override this to use a
+  // custom runtime.
+  virtual std::unique_ptr<RuntimeInterfaceBuilder> FindRuntimeInterface();
+
 public:
-  ~Interpreter();
+  virtual ~Interpreter();
+
   static llvm::Expected<std::unique_ptr<Interpreter>>
   create(std::unique_ptr<CompilerInstance> CI);
   static llvm::Expected<std::unique_ptr<Interpreter>>
@@ -142,8 +169,7 @@ class Interpreter {
 
 private:
   size_t getEffectivePTUSize() const;
-
-  bool FindRuntimeInterface();
+  void markUserCodeStart();
 
   llvm::DenseMap<CXXRecordDecl *, llvm::orc::ExecutorAddr> Dtors;
 
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 00b3f53f5c1c66..267c79cc057cba 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -6752,18 +6752,10 @@ class Sema final {
                             SourceLocation RParenLoc);
 
   //// ActOnCXXThis -  Parse 'this' pointer.
-  ///
-  /// \param ThisRefersToClosureObject Whether to skip the 'this' check for a
-  /// lambda because 'this' refers to the closure object.
-  ExprResult ActOnCXXThis(SourceLocation loc,
-                          bool ThisRefersToClosureObject = false);
+  ExprResult ActOnCXXThis(SourceLocation loc);
 
   /// Build a CXXThisExpr and mark it referenced in the current context.
-  ///
-  /// \param ThisRefersToClosureObject Whether to skip the 'this' check for a
-  /// lambda because 'this' refers to the closure object.
-  Expr *BuildCXXThisExpr(SourceLocation Loc, QualType Type, bool IsImplicit,
-                         bool ThisRefersToClosureObject = false);
+  Expr *BuildCXXThisExpr(SourceLocation Loc, QualType Type, bool IsImplicit);
   void MarkThisReferenced(CXXThisExpr *This);
 
   /// Try to retrieve the type of the 'this' pointer.
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index 2002bf23c9595f..370d8037a4da17 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -2456,13 +2456,6 @@ class BitsUnpacker {
   uint32_t Value;
   uint32_t CurrentBitsIndex = ~0;
 };
-
-inline bool shouldSkipCheckingODR(const Decl *D) {
-  return D->getOwningModule() &&
-         D->getASTContext().getLangOpts().SkipODRCheckInGMF &&
-         D->getOwningModule()->isExplicitGlobalModule();
-}
-
 } // namespace clang
 
 #endif // LLVM_CLANG_SERIALIZATION_ASTREADER_H
diff --git a/clang/lib/APINotes/APINotesManager.cpp b/clang/lib/APINotes/APINotesManager.cpp
index d3aef09dac9105..f60f09e2b3c231 100644
--- a/clang/lib/APINotes/APINotesManager.cpp
+++ b/clang/lib/APINotes/APINotesManager.cpp
@@ -224,7 +224,7 @@ APINotesManager::getCurrentModuleAPINotes(Module *M, bool LookInModule,
   llvm::SmallVector<FileEntryRef, 2> APINotes;
 
   // First, look relative to the module itself.
-  if (LookInModule) {
+  if (LookInModule && M->Directory) {
     // Local function to try loading an API notes file in the given directory.
     auto tryAPINotes = [&](DirectoryEntryRef Dir, bool WantPublic) {
       if (auto File = findAPINotesFile(Dir, ModuleName, WantPublic)) {
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index d681791d3920c3..8626f04012f7d4 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -4496,7 +4496,7 @@ unsigned FunctionDecl::getODRHash() {
   }
 
   class ODRHash Hash;
-  Hash.AddFunctionDecl(this);
+  Hash.AddFunctionDecl(this, /*SkipBody=*/shouldSkipCheckingODR());
   setHasODRHash(true);
   ODRHash = Hash.CalculateHash();
   return ODRHash;
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index fcedb3cfd176a0..04bbc49ab2f319 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -1102,6 +1102,11 @@ bool Decl::isInAnotherModuleUnit() const {
   return M != getASTContext().getCurrentNamedModule();
 }
 
+bool Decl::shouldSkipCheckingODR() const {
+  return getASTContext().getLangOpts().SkipODRCheckInGMF && getOwningModule() &&
+         getOwningModule()->isExplicitGlobalModule();
+}
+
 static Decl::Kind getKind(const Decl *D) { return D->getKind(); }
 static Decl::Kind getKind(const DeclContext *DC) { return DC->getDeclKind(); }
 
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 78dcd3f4007a5a..22666184c56ccf 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2682,6 +2682,8 @@ bool QualType::isTriviallyRelocatableType(const ASTContext &Context) const {
     return false;
   } else if (const auto *RD = BaseElementType->getAsRecordDecl()) {
     return RD->canPassInRegisters();
+  } else if (BaseElementType.isTriviallyCopyableType(Context)) {
+    return true;
   } else {
     switch (isNonTrivialToPrimitiveDestructiveMove()) {
     case PCK_Trivial:
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index 37696b28976428..e293fefb524963 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -280,15 +280,14 @@ Interpreter::create(std::unique_ptr<CompilerInstance> CI) {
   if (Err)
     return std::move(Err);
 
+  // Add runtime code and set a marker to hide it from user code. Undo will not
+  // go through that.
   auto PTU = Interp->Parse(Runtimes);
   if (!PTU)
     return PTU.takeError();
+  Interp->markUserCodeStart();
 
   Interp->ValuePrintingInfo.resize(4);
-  // FIXME: This is a ugly hack. Undo command checks its availability by looking
-  // at the size of the PTU list. However we have parsed something in the
-  // beginning of the REPL so we have to mark them as 'Irrevocable'.
-  Interp->InitPTUSize = Interp->IncrParser->getPTUs().size();
   return std::move(Interp);
 }
 
@@ -345,6 +344,11 @@ const ASTContext &Interpreter::getASTContext() const {
   return getCompilerInstance()->getASTContext();
 }
 
+void Interpreter::markUserCodeStart() {
+  assert(!InitPTUSize && "We only do this once");
+  InitPTUSize = IncrParser->getPTUs().size();
+}
+
 size_t Interpreter::getEffectivePTUSize() const {
   std::list<PartialTranslationUnit> &PTUs = IncrParser->getPTUs();
   assert(PTUs.size() >= InitPTUSize && "empty PTU list?");
@@ -507,9 +511,13 @@ static constexpr llvm::StringRef MagicRuntimeInterface[] = {
     "__clang_Interpreter_SetValueWithAlloc",
     "__clang_Interpreter_SetValueCopyArr", "__ci_newtag"};
 
-bool Interpreter::FindRuntimeInterface() {
+static std::unique_ptr<RuntimeInterfaceBuilder>
+createInProcessRuntimeInterfaceBuilder(Interpreter &Interp, ASTContext &Ctx,
+                                       Sema &S);
+
+std::unique_ptr<RuntimeInterfaceBuilder> Interpreter::FindRuntimeInterface() {
   if (llvm::all_of(ValuePrintingInfo, [](Expr *E) { return E != nullptr; }))
-    return true;
+    return nullptr;
 
   Sema &S = getCompilerInstance()->getSema();
   ASTContext &Ctx = S.getASTContext();
@@ -528,120 +536,34 @@ bool Interpreter::FindRuntimeInterface() {
 
   if (!LookupInterface(ValuePrintingInfo[NoAlloc],
                        MagicRuntimeInterface[NoAlloc]))
-    return false;
+    return nullptr;
   if (!LookupInterface(ValuePrintingInfo[WithAlloc],
                        MagicRuntimeInterface[WithAlloc]))
-    return false;
+    return nullptr;
   if (!LookupInterface(ValuePrintingInfo[CopyArray],
                        MagicRuntimeInterface[CopyArray]))
-    return false;
+    return nullptr;
   if (!LookupInterface(ValuePrintingInfo[NewTag],
                        MagicRuntimeInterface[NewTag]))
-    return false;
-  return true;
+    return nullptr;
+
+  return createInProcessRuntimeInterfaceBuilder(*this, Ctx, S);
 }
 
 namespace {
 
-class RuntimeInterfaceBuilder
-    : public TypeVisitor<RuntimeInterfaceBuilder, Interpreter::InterfaceKind> {
-  clang::Interpreter &Interp;
+class InterfaceKindVisitor
+    : public TypeVisitor<InterfaceKindVisitor, Interpreter::InterfaceKind> {
+  friend class InProcessRuntimeInterfaceBuilder;
+
   ASTContext &Ctx;
   Sema &S;
   Expr *E;
   llvm::SmallVector<Expr *, 3> Args;
 
 public:
-  RuntimeInterfaceBuilder(clang::Interpreter &In, ASTContext &C, Sema &SemaRef,
-                          Expr *VE, ArrayRef<Expr *> FixedArgs)
-      : Interp(In), Ctx(C), S(SemaRef), E(VE) {
-    // The Interpreter* parameter and the out parameter `OutVal`.
-    for (Expr *E : FixedArgs)
-      Args.push_back(E);
-
-    // Get rid of ExprWithCleanups.
-    if (auto *EWC = llvm::dyn_cast_if_present<ExprWithCleanups>(E))
-      E = EWC->getSubExpr();
-  }
-
-  ExprResult getCall() {
-    QualType Ty = E->getType();
-    QualType DesugaredTy = Ty.getDesugaredType(Ctx);
-
-    // For lvalue struct, we treat it as a reference.
-    if (DesugaredTy->isRecordType() && E->isLValue()) {
-      DesugaredTy = Ctx.getLValueReferenceType(DesugaredTy);
-      Ty = Ctx.getLValueReferenceType(Ty);
-    }
-
-    Expr *TypeArg =
-        CStyleCastPtrExpr(S, Ctx.VoidPtrTy, (uintptr_t)Ty.getAsOpaquePtr());
-    // The QualType parameter `OpaqueType`, represented as `void*`.
-    Args.push_back(TypeArg);
-
-    // We push the last parameter based on the type of the Expr. Note we need
-    // special care for rvalue struct.
-    Interpreter::InterfaceKind Kind = Visit(&*DesugaredTy);
-    switch (Kind) {
-    case Interpreter::InterfaceKind::WithAlloc:
-    case Interpreter::InterfaceKind::CopyArray: {
-      // __clang_Interpreter_SetValueWithAlloc.
-      ExprResult AllocCall = S.ActOnCallExpr(
-          /*Scope=*/nullptr,
-          Interp.getValuePrintingInfo()[Interpreter::InterfaceKind::WithAlloc],
-          E->getBeginLoc(), Args, E->getEndLoc());
-      assert(!AllocCall.isInvalid() && "Can't create runtime interface call!");
-
-      TypeSourceInfo *TSI = Ctx.getTrivialTypeSourceInfo(Ty, SourceLocation());
-
-      // Force CodeGen to emit destructor.
-      if (auto *RD = Ty->getAsCXXRecordDecl()) {
-        auto *Dtor = S.LookupDestructor(RD);
-        Dtor->addAttr(UsedAttr::CreateImplicit(Ctx));
-        Interp.getCompilerInstance()->getASTConsumer().HandleTopLevelDecl(
-            DeclGroupRef(Dtor));
-      }
-
-      // __clang_Interpreter_SetValueCopyArr.
-      if (Kind == Interpreter::InterfaceKind::CopyArray) {
-        const auto *ConstantArrTy =
-            cast<ConstantArrayType>(DesugaredTy.getTypePtr());
-        size_t ArrSize = Ctx.getConstantArrayElementCount(ConstantArrTy);
-        Expr *ArrSizeExpr = IntegerLiteralExpr(Ctx, ArrSize);
-        Expr *Args[] = {E, AllocCall.get(), ArrSizeExpr};
-        return S.ActOnCallExpr(
-            /*Scope *=*/nullptr,
-            Interp
-                .getValuePrintingInfo()[Interpreter::InterfaceKind::CopyArray],
-            SourceLocation(), Args, SourceLocation());
-      }
-      Expr *Args[] = {
-          AllocCall.get(),
-          Interp.getValuePrintingInfo()[Interpreter::InterfaceKind::NewTag]};
-      ExprResult CXXNewCall = S.BuildCXXNew(
-          E->getSourceRange(),
-          /*UseGlobal=*/true, /*PlacementLParen=*/SourceLocation(), Args,
-          /*PlacementRParen=*/SourceLocation(),
-          /*TypeIdParens=*/SourceRange(), TSI->getType(), TSI, std::nullopt,
-          E->getSourceRange(), E);
-
-      assert(!CXXNewCall.isInvalid() &&
-             "Can't create runtime placement new call!");
-
-      return S.ActOnFinishFullExpr(CXXNewCall.get(),
-                                   /*DiscardedValue=*/false);
-    }
-      // __clang_Interpreter_SetValueNoAlloc.
-    case Interpreter::InterfaceKind::NoAlloc: {
-      return S.ActOnCallExpr(
-          /*Scope=*/nullptr,
-          Interp.getValuePrintingInfo()[Interpreter::InterfaceKind::NoAlloc],
-          E->getBeginLoc(), Args, E->getEndLoc());
-    }
-    default:
-      llvm_unreachable("Unhandled Interpreter::InterfaceKind");
-    }
-  }
+  InterfaceKindVisitor(ASTContext &Ctx, Sema &S, Expr *E)
+      : Ctx(Ctx), S(S), E(E) {}
 
   Interpreter::InterfaceKind VisitRecordType(const RecordType *Ty) {
     return Interpreter::InterfaceKind::WithAlloc;
@@ -713,8 +635,124 @@ class RuntimeInterfaceBuilder
     Args.push_back(CastedExpr.get());
   }
 };
+
+class InProcessRuntimeInterfaceBuilder : public RuntimeInterfaceBuilder {
+  Interpreter &Interp;
+  ASTContext &Ctx;
+  Sema &S;
+
+public:
+  InProcessRuntimeInterfaceBuilder(Interpreter &Interp, ASTContext &C, Sema &S)
+      : Interp(Interp), Ctx(C), S(S) {}
+
+  TransformExprFunction *getPrintValueTransformer() override {
+    return &transformForValuePrinting;
+  }
+
+private:
+  static ExprResult transformForValuePrinting(RuntimeInterfaceBuilder *Builder,
+                                              Expr *E,
+                                              ArrayRef<Expr *> FixedArgs) {
+    auto *B = static_cast<InProcessRuntimeInterfaceBuilder *>(Builder);
+
+    // Get rid of ExprWithCleanups.
+    if (auto *EWC = llvm::dyn_cast_if_present<ExprWithCleanups>(E))
+      E = EWC->getSubExpr();
+
+    InterfaceKindVisitor Visitor(B->Ctx, B->S, E);
+
+    // The Interpreter* parameter and the out parameter `OutVal`.
+    for (Expr *E : FixedArgs)
+      Visitor.Args.push_back(E);
+
+    QualType Ty = E->getType();
+    QualType DesugaredTy = Ty.getDesugaredType(B->Ctx);
+
+    // For lvalue struct, we treat it as a reference.
+    if (DesugaredTy->isRecordType() && E->isLValue()) {
+      DesugaredTy = B->Ctx.getLValueReferenceType(DesugaredTy);
+      Ty = B->Ctx.getLValueReferenceType(Ty);
+    }
+
+    Expr *TypeArg = CStyleCastPtrExpr(B->S, B->Ctx.VoidPtrTy,
+                                      (uintptr_t)Ty.getAsOpaquePtr());
+    // The QualType parameter `OpaqueType`, represented as `void*`.
+    Visitor.Args.push_back(TypeArg);
+
+    // We push the last parameter based on the type of the Expr. Note we need
+    // special care for rvalue struct.
+    Interpreter::InterfaceKind Kind = Visitor.Visit(&*DesugaredTy);
+    switch (Kind) {
+    case Interpreter::InterfaceKind::WithAlloc:
+    case Interpreter::InterfaceKind::CopyArray: {
+      // __clang_Interpreter_SetValueWithAlloc.
+      ExprResult AllocCall = B->S.ActOnCallExpr(
+          /*Scope=*/nullptr,
+          B->Interp
+              .getValuePrintingInfo()[Interpreter::InterfaceKind::WithAlloc],
+          E->getBeginLoc(), Visitor.Args, E->getEndLoc());
+      assert(!AllocCall.isInvalid() && "Can't create runtime interface call!");
+
+      TypeSourceInfo *TSI =
+          B->Ctx.getTrivialTypeSourceInfo(Ty, SourceLocation());
+
+      // Force CodeGen to emit destructor.
+      if (auto *RD = Ty->getAsCXXRecordDecl()) {
+        auto *Dtor = B->S.LookupDestructor(RD);
+        Dtor->addAttr(UsedAttr::CreateImplicit(B->Ctx));
+        B->Interp.getCompilerInstance()->getASTConsumer().HandleTopLevelDecl(
+            DeclGroupRef(Dtor));
+      }
+
+      // __clang_Interpreter_SetValueCopyArr.
+      if (Kind == Interpreter::InterfaceKind::CopyArray) {
+        const auto *ConstantArrTy =
+            cast<ConstantArrayType>(DesugaredTy.getTypePtr());
+        size_t ArrSize = B->Ctx.getConstantArrayElementCount(ConstantArrTy);
+        Expr *ArrSizeExpr = IntegerLiteralExpr(B->Ctx, ArrSize);
+        Expr *Args[] = {E, AllocCall.get(), ArrSizeExpr};
+        return B->S.ActOnCallExpr(
+            /*Scope *=*/nullptr,
+            B->Interp
+                .getValuePrintingInfo()[Interpreter::InterfaceKind::CopyArray],
+            SourceLocation(), Args, SourceLocation());
+      }
+      Expr *Args[] = {
+          AllocCall.get(),
+          B->Interp.getValuePrintingInfo()[Interpreter::InterfaceKind::NewTag]};
+      ExprResult CXXNewCall = B->S.BuildCXXNew(
+          E->getSourceRange(),
+          /*UseGlobal=*/true, /*PlacementLParen=*/SourceLocation(), Args,
+          /*PlacementRParen=*/SourceLocation(),
+          /*TypeIdParens=*/SourceRange(), TSI->getType(), TSI, std::nullopt,
+          E->getSourceRange(), E);
+
+      assert(!CXXNewCall.isInvalid() &&
+             "Can't create runtime placement new call!");
+
+      return B->S.ActOnFinishFullExpr(CXXNewCall.get(),
+                                      /*DiscardedValue=*/false);
+    }
+      // __clang_Interpreter_SetValueNoAlloc.
+    case Interpreter::InterfaceKind::NoAlloc: {
+      return B->S.ActOnCallExpr(
+          /*Scope=*/nullptr,
+          B->Interp.getValuePrintingInfo()[Interpreter::InterfaceKind::NoAlloc],
+          E->getBeginLoc(), Visitor.Args, E->getEndLoc());
+    }
+    default:
+      llvm_unreachable("Unhandled Interpreter::InterfaceKind");
+    }
+  }
+};
 } // namespace
 
+static std::unique_ptr<RuntimeInterfaceBuilder>
+createInProcessRuntimeInterfaceBuilder(Interpreter &Interp, ASTContext &Ctx,
+                                       Sema &S) {
+  return std::make_unique<InProcessRuntimeInterfaceBuilder>(Interp, Ctx, S);
+}
+
 // This synthesizes a call expression to a speciall
 // function that is responsible for generating the Value.
 // In general, we transform:
@@ -733,8 +771,13 @@ Expr *Interpreter::SynthesizeExpr(Expr *E) {
   Sema &S = getCompilerInstance()->getSema();
   ASTContext &Ctx = S.getASTContext();
 
-  if (!FindRuntimeInterface())
-    llvm_unreachable("We can't find the runtime iterface for pretty print!");
+  if (!RuntimeIB) {
+    RuntimeIB = FindRuntimeInterface();
+    AddPrintValueCall = RuntimeIB->getPrintValueTransformer();
+  }
+
+  assert(AddPrintValueCall &&
+         "We don't have a runtime interface for pretty print!");
 
   // Create parameter `ThisInterp`.
   auto *ThisInterp = CStyleCastPtrExpr(S, Ctx.VoidPtrTy, (uintptr_t)this);
@@ -743,9 +786,9 @@ Expr *Interpreter::SynthesizeExpr(Expr *E) {
   auto *OutValue = CStyleCastPtrExpr(S, Ctx.VoidPtrTy, (uintptr_t)&LastValue);
 
   // Build `__clang_Interpreter_SetValue*` call.
-  RuntimeInterfaceBuilder Builder(*this, Ctx, S, E, {ThisInterp, OutValue});
+  ExprResult Result =
+      AddPrintValueCall(RuntimeIB.get(), E, {ThisInterp, OutValue});
 
-  ExprResult Result = Builder.getCall();
   // It could fail, like printing an array type in C. (not supported)
   if (Result.isInvalid())
     return E;
diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp
index 5206fc7621c7cd..736632857efc36 100644
--- a/clang/lib/Sema/SemaCoroutine.cpp
+++ b/clang/lib/Sema/SemaCoroutine.cpp
@@ -25,7 +25,6 @@
 #include "clang/Sema/Initialization.h"
 #include "clang/Sema/Overload.h"
 #include "clang/Sema/ScopeInfo.h"
-#include "clang/Sema/Sema.h"
 #include "clang/Sema/SemaInternal.h"
 #include "llvm/ADT/SmallSet.h"
 
@@ -1291,21 +1290,8 @@ bool CoroutineStmtBuilder::makeReturnOnAllocFailure() {
 static bool collectPlacementArgs(Sema &S, FunctionDecl &FD, SourceLocation Loc,
                                  SmallVectorImpl<Expr *> &PlacementArgs) {
   if (auto *MD = dyn_cast<CXXMethodDecl>(&FD)) {
-    if (MD->isImplicitObjectMemberFunction()) {
-      ExprResult ThisExpr{};
-
-      if (isLambdaCallOperator(MD) && !MD->isStatic()) {
-        Qualifiers ThisQuals = MD->getMethodQualifiers();
-        CXXRecordDecl *Record = MD->getParent();
-
-        Sema::CXXThisScopeRAII ThisScope(S, Record, ThisQuals,
-                                         Record != nullptr);
-
-        ThisExpr = S.ActOnCXXThis(Loc, /*ThisRefersToClosureObject=*/true);
-      } else {
-        ThisExpr = S.ActOnCXXThis(Loc);
-      }
-
+    if (MD->isImplicitObjectMemberFunction() && !isLambdaCallOperator(MD)) {
+      ExprResult ThisExpr = S.ActOnCXXThis(Loc);
       if (ThisExpr.isInvalid())
         return false;
       ThisExpr = S.CreateBuiltinUnaryOp(Loc, UO_Deref, ThisExpr.get());
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 88e3d9ced044cb..c34a40fa7c81ac 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -1414,8 +1414,7 @@ bool Sema::CheckCXXThisCapture(SourceLocation Loc, const bool Explicit,
   return false;
 }
 
-ExprResult Sema::ActOnCXXThis(SourceLocation Loc,
-                              bool ThisRefersToClosureObject) {
+ExprResult Sema::ActOnCXXThis(SourceLocation Loc) {
   /// C++ 9.3.2: In the body of a non-static member function, the keyword this
   /// is a non-lvalue expression whose value is the address of the object for
   /// which the function is called.
@@ -1435,18 +1434,13 @@ ExprResult Sema::ActOnCXXThis(SourceLocation Loc,
     return Diag(Loc, diag::err_invalid_this_use) << 0;
   }
 
-  return BuildCXXThisExpr(Loc, ThisTy, /*IsImplicit=*/false,
-                          ThisRefersToClosureObject);
+  return BuildCXXThisExpr(Loc, ThisTy, /*IsImplicit=*/false);
 }
 
-Expr *Sema::BuildCXXThisExpr(SourceLocation Loc, QualType Type, bool IsImplicit,
-                             bool ThisRefersToClosureObject) {
+Expr *Sema::BuildCXXThisExpr(SourceLocation Loc, QualType Type,
+                             bool IsImplicit) {
   auto *This = CXXThisExpr::Create(Context, Loc, Type, IsImplicit);
-
-  if (!ThisRefersToClosureObject) {
-    MarkThisReferenced(This);
-  }
-
+  MarkThisReferenced(This);
   return This;
 }
 
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 683a076e6bc399..ede9f6e93469b7 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -9762,7 +9762,7 @@ void ASTReader::finishPendingActions() {
             !NonConstDefn->isLateTemplateParsed() &&
             // We only perform ODR checks for decls not in the explicit
             // global module fragment.
-            !shouldSkipCheckingODR(FD) &&
+            !FD->shouldSkipCheckingODR() &&
             FD->getODRHash() != NonConstDefn->getODRHash()) {
           if (!isa<CXXMethodDecl>(FD)) {
             PendingFunctionOdrMergeFailures[FD].push_back(NonConstDefn);
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index d5309e3fc31f70..a22f760408c634 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -832,7 +832,7 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) {
       Reader.mergeDefinitionVisibility(OldDef, ED);
       // We don't want to check the ODR hash value for declarations from global
       // module fragment.
-      if (!shouldSkipCheckingODR(ED) &&
+      if (!ED->shouldSkipCheckingODR() &&
           OldDef->getODRHash() != ED->getODRHash())
         Reader.PendingEnumOdrMergeFailures[OldDef].push_back(ED);
     } else {
@@ -874,7 +874,7 @@ void ASTDeclReader::VisitRecordDecl(RecordDecl *RD) {
   VisitRecordDeclImpl(RD);
   // We should only reach here if we're in C/Objective-C. There is no
   // global module fragment.
-  assert(!shouldSkipCheckingODR(RD));
+  assert(!RD->shouldSkipCheckingODR());
   RD->setODRHash(Record.readInt());
 
   // Maintain the invariant of a redeclaration chain containing only
@@ -2152,7 +2152,7 @@ void ASTDeclReader::MergeDefinitionData(
   }
 
   // We don't want to check ODR for decls in the global module fragment.
-  if (shouldSkipCheckingODR(MergeDD.Definition))
+  if (MergeDD.Definition->shouldSkipCheckingODR())
     return;
 
   if (D->getODRHash() != MergeDD.ODRHash) {
@@ -3526,7 +3526,7 @@ ASTDeclReader::FindExistingResult ASTDeclReader::findExisting(NamedDecl *D) {
   // same template specialization into the same CXXRecordDecl.
   auto MergedDCIt = Reader.MergedDeclContexts.find(D->getLexicalDeclContext());
   if (MergedDCIt != Reader.MergedDeclContexts.end() &&
-      !shouldSkipCheckingODR(D) && MergedDCIt->second == D->getDeclContext())
+      !D->shouldSkipCheckingODR() && MergedDCIt->second == D->getDeclContext())
     Reader.PendingOdrMergeChecks.push_back(D);
 
   return FindExistingResult(Reader, D, /*Existing=*/nullptr,
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 6904c924c2fd3d..3653d94c6e0739 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -6060,7 +6060,7 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) {
 
   BitsPacker DefinitionBits;
 
-  bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D);
+  bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR();
   DefinitionBits.addBit(ShouldSkipCheckingODR);
 
 #define FIELD(Name, Width, Merge)                                              \
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index e1862de4a35b8f..d04e1c781b4e28 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -519,7 +519,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) {
   BitsPacker EnumDeclBits;
   EnumDeclBits.addBits(D->getNumPositiveBits(), /*BitWidth=*/8);
   EnumDeclBits.addBits(D->getNumNegativeBits(), /*BitWidth=*/8);
-  bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D);
+  bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR();
   EnumDeclBits.addBit(ShouldSkipCheckingODR);
   EnumDeclBits.addBit(D->isScoped());
   EnumDeclBits.addBit(D->isScopedUsingClassTag());
@@ -545,7 +545,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) {
       !D->isTopLevelDeclInObjCContainer() &&
       !CXXRecordDecl::classofKind(D->getKind()) &&
       !D->getIntegerTypeSourceInfo() && !D->getMemberSpecializationInfo() &&
-      !needsAnonymousDeclarationNumber(D) && !shouldSkipCheckingODR(D) &&
+      !needsAnonymousDeclarationNumber(D) && !D->shouldSkipCheckingODR() &&
       D->getDeclName().getNameKind() == DeclarationName::Identifier)
     AbbrevToUse = Writer.getDeclEnumAbbrev();
 
@@ -711,7 +711,7 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) {
   // FIXME: stable encoding
   FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3);
   FunctionDeclBits.addBits((uint32_t)D->getStorageClass(), /*BitWidth=*/3);
-  bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D);
+  bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR();
   FunctionDeclBits.addBit(ShouldSkipCheckingODR);
   FunctionDeclBits.addBit(D->isInlineSpecified());
   FunctionDeclBits.addBit(D->isInlined());
@@ -1545,7 +1545,7 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) {
       D->getFirstDecl() == D->getMostRecentDecl() && !D->isInvalidDecl() &&
       !D->hasAttrs() && !D->isTopLevelDeclInObjCContainer() &&
       D->getDeclName().getNameKind() == DeclarationName::Identifier &&
-      !shouldSkipCheckingODR(D) && !D->hasExtInfo() &&
+      !D->shouldSkipCheckingODR() && !D->hasExtInfo() &&
       !D->isExplicitlyDefaulted()) {
     if (D->getTemplatedKind() == FunctionDecl::TK_NonTemplate ||
         D->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate ||
diff --git a/clang/test/Modules/hashing-decls-in-exprs-from-gmf.cppm b/clang/test/Modules/hashing-decls-in-exprs-from-gmf.cppm
new file mode 100644
index 00000000000000..8db53c0ace8796
--- /dev/null
+++ b/clang/test/Modules/hashing-decls-in-exprs-from-gmf.cppm
@@ -0,0 +1,67 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/A.cppm -emit-module-interface -o %t/A.pcm
+// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/B.cppm -emit-module-interface -o %t/B.pcm
+// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/test.cpp -fprebuilt-module-path=%t -fsyntax-only -verify
+
+//--- header.h
+#pragma once
+template <class _Tp>
+class Optional {};
+
+template <class _Tp>
+concept C = requires(const _Tp& __t) {
+    []<class _Up>(const Optional<_Up>&) {}(__t);
+};
+
+//--- func.h
+#include "header.h"
+template <C T>
+void func() {}
+
+//--- duplicated_func.h
+#include "header.h"
+template <C T>
+void duplicated_func() {}
+
+//--- test_func.h
+#include "func.h"
+
+void test_func() {
+    func<Optional<int>>();
+}
+
+//--- test_duplicated_func.h
+#include "duplicated_func.h"
+
+void test_duplicated_func() {
+    duplicated_func<Optional<int>>();
+}
+
+//--- A.cppm
+module;
+#include "header.h"
+#include "test_duplicated_func.h"
+export module A;
+export using ::test_duplicated_func;
+
+//--- B.cppm
+module;
+#include "header.h"
+#include "test_func.h"
+#include "test_duplicated_func.h"
+export module B;
+export using ::test_func;
+export using ::test_duplicated_func;
+
+//--- test.cpp
+// expected-no-diagnostics
+import A;
+import B;
+
+void test() {
+    test_func();
+    test_duplicated_func();
+}
diff --git a/clang/test/SemaCXX/gh84064-1.cpp b/clang/test/SemaCXX/gh84064-1.cpp
deleted file mode 100644
index d9c2738a002b8d..00000000000000
--- a/clang/test/SemaCXX/gh84064-1.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -I%S/Inputs -std=c++20 %s
-
-// expected-no-diagnostics
-
-#include "std-coroutine.h"
-
-using size_t = decltype(sizeof(0));
-
-struct Generator {
-  struct promise_type {
-    int _val{};
-
-    Generator get_return_object() noexcept
-    {
-      return {};
-    }
-
-    std::suspend_never initial_suspend() noexcept
-    {
-      return {};
-    }
-
-    std::suspend_always final_suspend() noexcept
-    {
-      return {};
-    }
-
-    void return_void() noexcept {}
-    void unhandled_exception() noexcept {}
-
-    template<typename This, typename... TheRest>
-    static void*
-    operator new(size_t size,
-                 This&,
-                 TheRest&&...) noexcept
-    {
-        return nullptr;
-    }
-
-    static void operator delete(void*, size_t)
-    {
-    }
-  };
-};
-
-struct CapturingThisTest
-{
-    int x{};
-
-    void AsPointer()
-    {
-      auto lamb = [=,this]() -> Generator {
-        int y = x;
-        co_return;
-      };
-
-      static_assert(sizeof(decltype(lamb)) == sizeof(void*));
-    }
-
-    void AsStarThis()
-    {
-      auto lamb = [*this]() -> Generator {
-        int y = x;
-        co_return;
-      };
-
-      static_assert(sizeof(decltype(lamb)) == sizeof(int));
-    }
-};
-
-int main()
-{
-  auto lamb = []() -> Generator {
-    co_return;
-  };
-
-  static_assert(sizeof(decltype(lamb)) == 1);
-}
-
diff --git a/clang/test/SemaCXX/gh84064-2.cpp b/clang/test/SemaCXX/gh84064-2.cpp
deleted file mode 100644
index 457de43eab6d9e..00000000000000
--- a/clang/test/SemaCXX/gh84064-2.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -I%S/Inputs -std=c++23 %s
-
-// expected-no-diagnostics
-
-#include "std-coroutine.h"
-
-using size_t = decltype(sizeof(0));
-
-struct GeneratorStatic {
-  struct promise_type {
-    int _val{};
-
-    GeneratorStatic get_return_object() noexcept
-    {
-      return {};
-    }
-
-    std::suspend_never initial_suspend() noexcept
-    {
-      return {};
-    }
-
-    std::suspend_always final_suspend() noexcept
-    {
-      return {};
-    }
-
-    void return_void() noexcept {}
-    void unhandled_exception() noexcept {}
-
-    template<typename... TheRest>
-    static void*
-    operator new(size_t  size,
-                 TheRest&&...) noexcept
-    {
-        return nullptr;
-    }
-
-    static void operator delete(void*, size_t)
-    {
-    }
-  };
-};
-
-
-int main()
-{
-  auto lambCpp23 = []() static -> GeneratorStatic {
-    co_return;
-  };
-
-  static_assert(sizeof(decltype(lambCpp23)) == 1);
-}
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index f50f51cc70982e..14ec17989ec7c7 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -1722,91 +1722,91 @@ struct StructWithAnonUnion3 {
 
 void is_layout_compatible(int n)
 {
-  static_assert(__is_layout_compatible(void, void), "");
-  static_assert(!__is_layout_compatible(void, int), "");
-  static_assert(__is_layout_compatible(void, const void), "");
-  static_assert(__is_layout_compatible(void, volatile void), "");
-  static_assert(__is_layout_compatible(const int, volatile int), "");
-  static_assert(__is_layout_compatible(int, int), "");
-  static_assert(__is_layout_compatible(int, const int), "");
-  static_assert(__is_layout_compatible(int, volatile int), "");
-  static_assert(__is_layout_compatible(const int, volatile int), "");
-  static_assert(__is_layout_compatible(int *, int * __restrict), "");
+  static_assert(__is_layout_compatible(void, void));
+  static_assert(!__is_layout_compatible(void, int));
+  static_assert(__is_layout_compatible(void, const void));
+  static_assert(__is_layout_compatible(void, volatile void));
+  static_assert(__is_layout_compatible(const int, volatile int));
+  static_assert(__is_layout_compatible(int, int));
+  static_assert(__is_layout_compatible(int, const int));
+  static_assert(__is_layout_compatible(int, volatile int));
+  static_assert(__is_layout_compatible(const int, volatile int));
+  static_assert(__is_layout_compatible(int *, int * __restrict));
   // Note: atomic qualification matters for layout compatibility.
-  static_assert(!__is_layout_compatible(int, _Atomic int), "");
-  static_assert(__is_layout_compatible(_Atomic(int), _Atomic int), "");
-  static_assert(!__is_layout_compatible(int, unsigned int), "");
-  static_assert(!__is_layout_compatible(char, unsigned char), "");
-  static_assert(!__is_layout_compatible(char, signed char), "");
-  static_assert(!__is_layout_compatible(unsigned char, signed char), "");
-  static_assert(__is_layout_compatible(int[], int[]), "");
-  static_assert(__is_layout_compatible(int[2], int[2]), "");
-  static_assert(!__is_layout_compatible(int[n], int[2]), ""); // FIXME: VLAs should be rejected
-  static_assert(!__is_layout_compatible(int[n], int[n]), ""); // FIXME: VLAs should be rejected
-  static_assert(__is_layout_compatible(int&, int&), "");
-  static_assert(!__is_layout_compatible(int&, char&), "");
-  static_assert(__is_layout_compatible(void(int), void(int)), "");
-  static_assert(!__is_layout_compatible(void(int), void(char)), "");
-  static_assert(__is_layout_compatible(void(&)(int), void(&)(int)), "");
-  static_assert(!__is_layout_compatible(void(&)(int), void(&)(char)), "");
-  static_assert(__is_layout_compatible(void(*)(int), void(*)(int)), "");
-  static_assert(!__is_layout_compatible(void(*)(int), void(*)(char)), "");
+  static_assert(!__is_layout_compatible(int, _Atomic int));
+  static_assert(__is_layout_compatible(_Atomic(int), _Atomic int));
+  static_assert(!__is_layout_compatible(int, unsigned int));
+  static_assert(!__is_layout_compatible(char, unsigned char));
+  static_assert(!__is_layout_compatible(char, signed char));
+  static_assert(!__is_layout_compatible(unsigned char, signed char));
+  static_assert(__is_layout_compatible(int[], int[]));
+  static_assert(__is_layout_compatible(int[2], int[2]));
+  static_assert(!__is_layout_compatible(int[n], int[2])); // FIXME: VLAs should be rejected
+  static_assert(!__is_layout_compatible(int[n], int[n])); // FIXME: VLAs should be rejected
+  static_assert(__is_layout_compatible(int&, int&));
+  static_assert(!__is_layout_compatible(int&, char&));
+  static_assert(__is_layout_compatible(void(int), void(int)));
+  static_assert(!__is_layout_compatible(void(int), void(char)));
+  static_assert(__is_layout_compatible(void(&)(int), void(&)(int)));
+  static_assert(!__is_layout_compatible(void(&)(int), void(&)(char)));
+  static_assert(__is_layout_compatible(void(*)(int), void(*)(int)));
+  static_assert(!__is_layout_compatible(void(*)(int), void(*)(char)));
   using function_type = void();
   using function_type2 = void(char);
-  static_assert(__is_layout_compatible(const function_type, const function_type), "");
+  static_assert(__is_layout_compatible(const function_type, const function_type));
   // expected-warning@-1 {{'const' qualifier on function type 'function_type' (aka 'void ()') has no effect}}
   // expected-warning@-2 {{'const' qualifier on function type 'function_type' (aka 'void ()') has no effect}}
-  static_assert(__is_layout_compatible(function_type, const function_type), "");
+  static_assert(__is_layout_compatible(function_type, const function_type));
   // expected-warning@-1 {{'const' qualifier on function type 'function_type' (aka 'void ()') has no effect}}
-  static_assert(!__is_layout_compatible(const function_type, const function_type2), "");
+  static_assert(!__is_layout_compatible(const function_type, const function_type2));
   // expected-warning@-1 {{'const' qualifier on function type 'function_type' (aka 'void ()') has no effect}}
   // expected-warning@-2 {{'const' qualifier on function type 'function_type2' (aka 'void (char)') has no effect}}
-  static_assert(__is_layout_compatible(CStruct, CStruct2), "");
-  static_assert(__is_layout_compatible(CStruct, const CStruct2), "");
-  static_assert(__is_layout_compatible(CStruct, volatile CStruct2), "");
-  static_assert(__is_layout_compatible(const CStruct, volatile CStruct2), "");
-  static_assert(__is_layout_compatible(CEmptyStruct, CEmptyStruct2), "");
-  static_assert(__is_layout_compatible(CppEmptyStruct, CppEmptyStruct2), "");
-  static_assert(__is_layout_compatible(CppStructStandard, CppStructStandard2), "");
-  static_assert(!__is_layout_compatible(CppStructNonStandardByBase, CppStructNonStandardByBase2), "");
-  static_assert(!__is_layout_compatible(CppStructNonStandardByVirt, CppStructNonStandardByVirt2), "");
-  static_assert(!__is_layout_compatible(CppStructNonStandardByMemb, CppStructNonStandardByMemb2), "");
-  static_assert(!__is_layout_compatible(CppStructNonStandardByProt, CppStructNonStandardByProt2), "");
-  static_assert(!__is_layout_compatible(CppStructNonStandardByVirtBase, CppStructNonStandardByVirtBase2), "");
-  static_assert(!__is_layout_compatible(CppStructNonStandardBySameBase, CppStructNonStandardBySameBase2), "");
-  static_assert(!__is_layout_compatible(CppStructNonStandardBy2ndVirtBase, CppStructNonStandardBy2ndVirtBase2), "");
-  static_assert(__is_layout_compatible(CStruct, CStructWithQualifiers), "");
-  static_assert(__is_layout_compatible(CStruct, CStructNoUniqueAddress) != bool(__has_cpp_attribute(no_unique_address)), "");
-  static_assert(__is_layout_compatible(CStructNoUniqueAddress, CStructNoUniqueAddress2) != bool(__has_cpp_attribute(no_unique_address)), "");
-  static_assert(__is_layout_compatible(CStruct, CStructAlignment), "");
-  static_assert(!__is_layout_compatible(CStruct, CStructAlignedMembers), "");
-  static_assert(__is_layout_compatible(UnionNoOveralignedMembers, UnionWithOveralignedMembers), "");
-  static_assert(__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds), "");
-  static_assert(__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds2), "");
-  static_assert(!__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds3), "");
-  static_assert(!__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds4), "");
-  static_assert(__is_layout_compatible(int CStruct2::*, int CStruct2::*), "");
-  static_assert(!__is_layout_compatible(int CStruct2::*, char CStruct2::*), "");
-  static_assert(__is_layout_compatible(void(CStruct2::*)(int), void(CStruct2::*)(int)), "");
-  static_assert(!__is_layout_compatible(void(CStruct2::*)(int), void(CStruct2::*)(char)), "");
-  static_assert(__is_layout_compatible(CStructNested, CStructNested2), "");
-  static_assert(__is_layout_compatible(UnionLayout, UnionLayout), "");
-  static_assert(!__is_layout_compatible(UnionLayout, UnionLayout2), "");
-  static_assert(!__is_layout_compatible(UnionLayout, UnionLayout3), "");
-  static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion2), "");
-  static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion3), "");
-  static_assert(__is_layout_compatible(EnumLayout, EnumClassLayout), "");
-  static_assert(__is_layout_compatible(EnumForward, EnumForward), "");
-  static_assert(__is_layout_compatible(EnumForward, EnumClassForward), "");
+  static_assert(__is_layout_compatible(CStruct, CStruct2));
+  static_assert(__is_layout_compatible(CStruct, const CStruct2));
+  static_assert(__is_layout_compatible(CStruct, volatile CStruct2));
+  static_assert(__is_layout_compatible(const CStruct, volatile CStruct2));
+  static_assert(__is_layout_compatible(CEmptyStruct, CEmptyStruct2));
+  static_assert(__is_layout_compatible(CppEmptyStruct, CppEmptyStruct2));
+  static_assert(__is_layout_compatible(CppStructStandard, CppStructStandard2));
+  static_assert(!__is_layout_compatible(CppStructNonStandardByBase, CppStructNonStandardByBase2));
+  static_assert(!__is_layout_compatible(CppStructNonStandardByVirt, CppStructNonStandardByVirt2));
+  static_assert(!__is_layout_compatible(CppStructNonStandardByMemb, CppStructNonStandardByMemb2));
+  static_assert(!__is_layout_compatible(CppStructNonStandardByProt, CppStructNonStandardByProt2));
+  static_assert(!__is_layout_compatible(CppStructNonStandardByVirtBase, CppStructNonStandardByVirtBase2));
+  static_assert(!__is_layout_compatible(CppStructNonStandardBySameBase, CppStructNonStandardBySameBase2));
+  static_assert(!__is_layout_compatible(CppStructNonStandardBy2ndVirtBase, CppStructNonStandardBy2ndVirtBase2));
+  static_assert(__is_layout_compatible(CStruct, CStructWithQualifiers));
+  static_assert(__is_layout_compatible(CStruct, CStructNoUniqueAddress) != bool(__has_cpp_attribute(no_unique_address)));
+  static_assert(__is_layout_compatible(CStructNoUniqueAddress, CStructNoUniqueAddress2) != bool(__has_cpp_attribute(no_unique_address)));
+  static_assert(__is_layout_compatible(CStruct, CStructAlignment));
+  static_assert(!__is_layout_compatible(CStruct, CStructAlignedMembers));
+  static_assert(__is_layout_compatible(UnionNoOveralignedMembers, UnionWithOveralignedMembers));
+  static_assert(__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds));
+  static_assert(__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds2));
+  static_assert(!__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds3));
+  static_assert(!__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds4));
+  static_assert(__is_layout_compatible(int CStruct2::*, int CStruct2::*));
+  static_assert(!__is_layout_compatible(int CStruct2::*, char CStruct2::*));
+  static_assert(__is_layout_compatible(void(CStruct2::*)(int), void(CStruct2::*)(int)));
+  static_assert(!__is_layout_compatible(void(CStruct2::*)(int), void(CStruct2::*)(char)));
+  static_assert(__is_layout_compatible(CStructNested, CStructNested2));
+  static_assert(__is_layout_compatible(UnionLayout, UnionLayout));
+  static_assert(!__is_layout_compatible(UnionLayout, UnionLayout2));
+  static_assert(!__is_layout_compatible(UnionLayout, UnionLayout3));
+  static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion2));
+  static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion3));
+  static_assert(__is_layout_compatible(EnumLayout, EnumClassLayout));
+  static_assert(__is_layout_compatible(EnumForward, EnumForward));
+  static_assert(__is_layout_compatible(EnumForward, EnumClassForward));
   // Layout compatibility for enums might be relaxed in the future. See https://github.com/cplusplus/CWG/issues/39#issuecomment-1184791364
-  static_assert(!__is_layout_compatible(EnumLayout, int), "");
-  static_assert(!__is_layout_compatible(EnumClassLayout, int), "");
-  static_assert(!__is_layout_compatible(EnumForward, int), "");
-  static_assert(!__is_layout_compatible(EnumClassForward, int), "");
+  static_assert(!__is_layout_compatible(EnumLayout, int));
+  static_assert(!__is_layout_compatible(EnumClassLayout, int));
+  static_assert(!__is_layout_compatible(EnumForward, int));
+  static_assert(!__is_layout_compatible(EnumClassForward, int));
   // FIXME: the following should be rejected (array of unknown bound and void are the only allowed incomplete types)
-  static_assert(__is_layout_compatible(CStructIncomplete, CStructIncomplete), ""); 
-  static_assert(!__is_layout_compatible(CStruct, CStructIncomplete), "");
-  static_assert(__is_layout_compatible(CStructIncomplete[2], CStructIncomplete[2]), "");
+  static_assert(__is_layout_compatible(CStructIncomplete, CStructIncomplete)); 
+  static_assert(!__is_layout_compatible(CStruct, CStructIncomplete));
+  static_assert(__is_layout_compatible(CStructIncomplete[2], CStructIncomplete[2]));
 }
 
 void is_signed()
@@ -3340,6 +3340,8 @@ namespace is_trivially_relocatable {
 static_assert(!__is_trivially_relocatable(void));
 static_assert(__is_trivially_relocatable(int));
 static_assert(__is_trivially_relocatable(int[]));
+static_assert(__is_trivially_relocatable(const int));
+static_assert(__is_trivially_relocatable(volatile int));
 
 enum Enum {};
 static_assert(__is_trivially_relocatable(Enum));
@@ -3351,7 +3353,28 @@ static_assert(__is_trivially_relocatable(Union[]));
 
 struct Trivial {};
 static_assert(__is_trivially_relocatable(Trivial));
+static_assert(__is_trivially_relocatable(const Trivial));
+static_assert(__is_trivially_relocatable(volatile Trivial));
+
 static_assert(__is_trivially_relocatable(Trivial[]));
+static_assert(__is_trivially_relocatable(const Trivial[]));
+static_assert(__is_trivially_relocatable(volatile Trivial[]));
+
+static_assert(__is_trivially_relocatable(int[10]));
+static_assert(__is_trivially_relocatable(const int[10]));
+static_assert(__is_trivially_relocatable(volatile int[10]));
+
+static_assert(__is_trivially_relocatable(int[10][10]));
+static_assert(__is_trivially_relocatable(const int[10][10]));
+static_assert(__is_trivially_relocatable(volatile int[10][10]));
+
+static_assert(__is_trivially_relocatable(int[]));
+static_assert(__is_trivially_relocatable(const int[]));
+static_assert(__is_trivially_relocatable(volatile int[]));
+
+static_assert(__is_trivially_relocatable(int[][10]));
+static_assert(__is_trivially_relocatable(const int[][10]));
+static_assert(__is_trivially_relocatable(volatile int[][10]));
 
 struct Incomplete; // expected-note {{forward declaration of 'is_trivially_relocatable::Incomplete'}}
 bool unused = __is_trivially_relocatable(Incomplete); // expected-error {{incomplete type}}
@@ -3361,6 +3384,8 @@ struct NontrivialDtor {
 };
 static_assert(!__is_trivially_relocatable(NontrivialDtor));
 static_assert(!__is_trivially_relocatable(NontrivialDtor[]));
+static_assert(!__is_trivially_relocatable(const NontrivialDtor));
+static_assert(!__is_trivially_relocatable(volatile NontrivialDtor));
 
 struct NontrivialCopyCtor {
   NontrivialCopyCtor(const NontrivialCopyCtor&) {}
@@ -3379,12 +3404,16 @@ struct [[clang::trivial_abi]] TrivialAbiNontrivialDtor {
 };
 static_assert(__is_trivially_relocatable(TrivialAbiNontrivialDtor));
 static_assert(__is_trivially_relocatable(TrivialAbiNontrivialDtor[]));
+static_assert(__is_trivially_relocatable(const TrivialAbiNontrivialDtor));
+static_assert(__is_trivially_relocatable(volatile TrivialAbiNontrivialDtor));
 
 struct [[clang::trivial_abi]] TrivialAbiNontrivialCopyCtor {
   TrivialAbiNontrivialCopyCtor(const TrivialAbiNontrivialCopyCtor&) {}
 };
 static_assert(__is_trivially_relocatable(TrivialAbiNontrivialCopyCtor));
 static_assert(__is_trivially_relocatable(TrivialAbiNontrivialCopyCtor[]));
+static_assert(__is_trivially_relocatable(const TrivialAbiNontrivialCopyCtor));
+static_assert(__is_trivially_relocatable(volatile TrivialAbiNontrivialCopyCtor));
 
 // A more complete set of tests for the behavior of trivial_abi can be found in
 // clang/test/SemaCXX/attr-trivial-abi.cpp
@@ -3393,6 +3422,8 @@ struct [[clang::trivial_abi]] TrivialAbiNontrivialMoveCtor {
 };
 static_assert(__is_trivially_relocatable(TrivialAbiNontrivialMoveCtor));
 static_assert(__is_trivially_relocatable(TrivialAbiNontrivialMoveCtor[]));
+static_assert(__is_trivially_relocatable(const TrivialAbiNontrivialMoveCtor));
+static_assert(__is_trivially_relocatable(volatile TrivialAbiNontrivialMoveCtor));
 
 } // namespace is_trivially_relocatable
 
diff --git a/clang/unittests/Interpreter/CMakeLists.txt b/clang/unittests/Interpreter/CMakeLists.txt
index 0ddedb283e07d1..046d96ad0ec644 100644
--- a/clang/unittests/Interpreter/CMakeLists.txt
+++ b/clang/unittests/Interpreter/CMakeLists.txt
@@ -10,6 +10,7 @@ add_clang_unittest(ClangReplInterpreterTests
   IncrementalCompilerBuilderTest.cpp
   IncrementalProcessingTest.cpp
   InterpreterTest.cpp
+  InterpreterExtensionsTest.cpp
   CodeCompletionTest.cpp
   )
 target_link_libraries(ClangReplInterpreterTests PUBLIC
diff --git a/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp b/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp
new file mode 100644
index 00000000000000..4e9f2dba210a37
--- /dev/null
+++ b/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp
@@ -0,0 +1,79 @@
+//===- unittests/Interpreter/InterpreterExtensionsTest.cpp ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Unit tests for Clang's Interpreter library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Interpreter/Interpreter.h"
+
+#include "clang/AST/Expr.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Sema/Lookup.h"
+#include "clang/Sema/Sema.h"
+
+#include "llvm/Support/Error.h"
+#include "llvm/Testing/Support/Error.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <system_error>
+
+using namespace clang;
+namespace {
+
+class RecordRuntimeIBMetrics : public Interpreter {
+  struct NoopRuntimeInterfaceBuilder : public RuntimeInterfaceBuilder {
+    NoopRuntimeInterfaceBuilder(Sema &S) : S(S) {}
+
+    TransformExprFunction *getPrintValueTransformer() override {
+      TransformerQueries += 1;
+      return &noop;
+    }
+
+    static ExprResult noop(RuntimeInterfaceBuilder *Builder, Expr *E,
+                           ArrayRef<Expr *> FixedArgs) {
+      auto *B = static_cast<NoopRuntimeInterfaceBuilder *>(Builder);
+      B->TransformedExprs += 1;
+      return B->S.ActOnFinishFullExpr(E, /*DiscardedValue=*/false);
+    }
+
+    Sema &S;
+    size_t TransformedExprs = 0;
+    size_t TransformerQueries = 0;
+  };
+
+public:
+  // Inheriting the constructor with `using` wouldn't make it public.
+  RecordRuntimeIBMetrics(std::unique_ptr<CompilerInstance> CI, llvm::Error &Err)
+      : Interpreter(std::move(CI), Err) {}
+
+  std::unique_ptr<RuntimeInterfaceBuilder> FindRuntimeInterface() override {
+    assert(RuntimeIBPtr == nullptr && "We create the builder only once");
+    Sema &S = getCompilerInstance()->getSema();
+    auto RuntimeIB = std::make_unique<NoopRuntimeInterfaceBuilder>(S);
+    RuntimeIBPtr = RuntimeIB.get();
+    return RuntimeIB;
+  }
+
+  NoopRuntimeInterfaceBuilder *RuntimeIBPtr = nullptr;
+};
+
+TEST(InterpreterExtensionsTest, FindRuntimeInterface) {
+  clang::IncrementalCompilerBuilder CB;
+  llvm::Error ErrOut = llvm::Error::success();
+  RecordRuntimeIBMetrics Interp(cantFail(CB.CreateCpp()), ErrOut);
+  cantFail(std::move(ErrOut));
+  cantFail(Interp.Parse("int a = 1; a"));
+  cantFail(Interp.Parse("int b = 2; b"));
+  cantFail(Interp.Parse("int c = 3; c"));
+  EXPECT_EQ(3U, Interp.RuntimeIBPtr->TransformedExprs);
+  EXPECT_EQ(1U, Interp.RuntimeIBPtr->TransformerQueries);
+}
+
+} // end anonymous namespace
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c
index 9421f67b768e0d..741b01faada4e2 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c
@@ -93,7 +93,7 @@ ValueProfNode *__llvm_profile_end_vnodes(void) { return &VNodesEnd; }
 ValueProfNode *CurrentVNode = &VNodesStart + 1;
 ValueProfNode *EndVNode = &VNodesEnd;
 
-/* lld-link provides __buildid symbol which ponits to the 16 bytes build id when
+/* lld-link provides a __buildid symbol pointing to the 16-byte build id when
  * using /build-id flag. https://lld.llvm.org/windows_support.html#lld-flags */
 #define BUILD_ID_LEN 16
 COMPILER_RT_WEAK uint8_t __buildid[BUILD_ID_LEN] = {0};
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 3ecdb55cdbf72f..a1be676730a786 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -974,7 +974,7 @@ INTERCEPTOR(SSIZE_T, read, int fd, void *ptr, SIZE_T count) {
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
-  SSIZE_T res = REAL(read)(fd, ptr, count);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(read)(fd, ptr, count);
   if (res > 0) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, res);
   if (res >= 0 && fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
   return res;
@@ -1009,7 +1009,7 @@ INTERCEPTOR(SSIZE_T, pread, int fd, void *ptr, SIZE_T count, OFF_T offset) {
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
-  SSIZE_T res = REAL(pread)(fd, ptr, count, offset);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(pread)(fd, ptr, count, offset);
   if (res > 0) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, res);
   if (res >= 0 && fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
   return res;
@@ -1027,7 +1027,7 @@ INTERCEPTOR(SSIZE_T, pread64, int fd, void *ptr, SIZE_T count, OFF64_T offset) {
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
-  SSIZE_T res = REAL(pread64)(fd, ptr, count, offset);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(pread64)(fd, ptr, count, offset);
   if (res > 0) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, res);
   if (res >= 0 && fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
   return res;
@@ -1043,7 +1043,7 @@ INTERCEPTOR_WITH_SUFFIX(SSIZE_T, readv, int fd, __sanitizer_iovec *iov,
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, readv, fd, iov, iovcnt);
   COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
-  SSIZE_T res = REAL(readv)(fd, iov, iovcnt);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(readv)(fd, iov, iovcnt);
   if (res > 0) write_iovec(ctx, iov, iovcnt, res);
   if (res >= 0 && fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
   return res;
@@ -1059,7 +1059,7 @@ INTERCEPTOR(SSIZE_T, preadv, int fd, __sanitizer_iovec *iov, int iovcnt,
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, preadv, fd, iov, iovcnt, offset);
   COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
-  SSIZE_T res = REAL(preadv)(fd, iov, iovcnt, offset);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(preadv)(fd, iov, iovcnt, offset);
   if (res > 0) write_iovec(ctx, iov, iovcnt, res);
   if (res >= 0 && fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
   return res;
@@ -1075,7 +1075,8 @@ INTERCEPTOR(SSIZE_T, preadv64, int fd, __sanitizer_iovec *iov, int iovcnt,
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, preadv64, fd, iov, iovcnt, offset);
   COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
-  SSIZE_T res = REAL(preadv64)(fd, iov, iovcnt, offset);
+  SSIZE_T res =
+      COMMON_INTERCEPTOR_BLOCK_REAL(preadv64)(fd, iov, iovcnt, offset);
   if (res > 0) write_iovec(ctx, iov, iovcnt, res);
   if (res >= 0 && fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
   return res;
@@ -1091,8 +1092,9 @@ INTERCEPTOR(SSIZE_T, write, int fd, void *ptr, SIZE_T count) {
   COMMON_INTERCEPTOR_ENTER(ctx, write, fd, ptr, count);
   COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
   if (fd >= 0) COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
-  SSIZE_T res = REAL(write)(fd, ptr, count);
-  // FIXME: this check should be _before_ the call to REAL(write), not after
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(write)(fd, ptr, count);
+  // FIXME: this check should be _before_ the call to
+  // COMMON_INTERCEPTOR_BLOCK_REAL(write), not after
   if (res > 0) COMMON_INTERCEPTOR_READ_RANGE(ctx, ptr, res);
   return res;
 }
@@ -1121,7 +1123,7 @@ INTERCEPTOR(SSIZE_T, pwrite, int fd, void *ptr, SIZE_T count, OFF_T offset) {
   COMMON_INTERCEPTOR_ENTER(ctx, pwrite, fd, ptr, count, offset);
   COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
   if (fd >= 0) COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
-  SSIZE_T res = REAL(pwrite)(fd, ptr, count, offset);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(pwrite)(fd, ptr, count, offset);
   if (res > 0) COMMON_INTERCEPTOR_READ_RANGE(ctx, ptr, res);
   return res;
 }
@@ -1137,7 +1139,7 @@ INTERCEPTOR(SSIZE_T, pwrite64, int fd, void *ptr, OFF64_T count,
   COMMON_INTERCEPTOR_ENTER(ctx, pwrite64, fd, ptr, count, offset);
   COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
   if (fd >= 0) COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
-  SSIZE_T res = REAL(pwrite64)(fd, ptr, count, offset);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(pwrite64)(fd, ptr, count, offset);
   if (res > 0) COMMON_INTERCEPTOR_READ_RANGE(ctx, ptr, res);
   return res;
 }
@@ -1153,7 +1155,7 @@ INTERCEPTOR_WITH_SUFFIX(SSIZE_T, writev, int fd, __sanitizer_iovec *iov,
   COMMON_INTERCEPTOR_ENTER(ctx, writev, fd, iov, iovcnt);
   COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
   if (fd >= 0) COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
-  SSIZE_T res = REAL(writev)(fd, iov, iovcnt);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(writev)(fd, iov, iovcnt);
   if (res > 0) read_iovec(ctx, iov, iovcnt, res);
   return res;
 }
@@ -1169,7 +1171,7 @@ INTERCEPTOR(SSIZE_T, pwritev, int fd, __sanitizer_iovec *iov, int iovcnt,
   COMMON_INTERCEPTOR_ENTER(ctx, pwritev, fd, iov, iovcnt, offset);
   COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
   if (fd >= 0) COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
-  SSIZE_T res = REAL(pwritev)(fd, iov, iovcnt, offset);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(pwritev)(fd, iov, iovcnt, offset);
   if (res > 0) read_iovec(ctx, iov, iovcnt, res);
   return res;
 }
@@ -1185,7 +1187,8 @@ INTERCEPTOR(SSIZE_T, pwritev64, int fd, __sanitizer_iovec *iov, int iovcnt,
   COMMON_INTERCEPTOR_ENTER(ctx, pwritev64, fd, iov, iovcnt, offset);
   COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
   if (fd >= 0) COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
-  SSIZE_T res = REAL(pwritev64)(fd, iov, iovcnt, offset);
+  SSIZE_T res =
+      COMMON_INTERCEPTOR_BLOCK_REAL(pwritev64)(fd, iov, iovcnt, offset);
   if (res > 0) read_iovec(ctx, iov, iovcnt, res);
   return res;
 }
@@ -2549,7 +2552,7 @@ INTERCEPTOR_WITH_SUFFIX(int, wait, int *status) {
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
-  int res = REAL(wait)(status);
+  int res = COMMON_INTERCEPTOR_BLOCK_REAL(wait)(status);
   if (res != -1 && status)
     COMMON_INTERCEPTOR_WRITE_RANGE(ctx, status, sizeof(*status));
   return res;
@@ -2567,7 +2570,7 @@ INTERCEPTOR_WITH_SUFFIX(int, waitid, int idtype, int id, void *infop,
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
-  int res = REAL(waitid)(idtype, id, infop, options);
+  int res = COMMON_INTERCEPTOR_BLOCK_REAL(waitid)(idtype, id, infop, options);
   if (res != -1 && infop)
     COMMON_INTERCEPTOR_WRITE_RANGE(ctx, infop, siginfo_t_sz);
   return res;
@@ -2578,7 +2581,7 @@ INTERCEPTOR_WITH_SUFFIX(int, waitpid, int pid, int *status, int options) {
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
-  int res = REAL(waitpid)(pid, status, options);
+  int res = COMMON_INTERCEPTOR_BLOCK_REAL(waitpid)(pid, status, options);
   if (res != -1 && status)
     COMMON_INTERCEPTOR_WRITE_RANGE(ctx, status, sizeof(*status));
   return res;
@@ -2589,7 +2592,7 @@ INTERCEPTOR(int, wait3, int *status, int options, void *rusage) {
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
-  int res = REAL(wait3)(status, options, rusage);
+  int res = COMMON_INTERCEPTOR_BLOCK_REAL(wait3)(status, options, rusage);
   if (res != -1) {
     if (status) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, status, sizeof(*status));
     if (rusage) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, rusage, struct_rusage_sz);
@@ -2603,7 +2606,8 @@ INTERCEPTOR(int, __wait4, int pid, int *status, int options, void *rusage) {
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
-  int res = REAL(__wait4)(pid, status, options, rusage);
+  int res =
+      COMMON_INTERCEPTOR_BLOCK_REAL(__wait4)(pid, status, options, rusage);
   if (res != -1) {
     if (status) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, status, sizeof(*status));
     if (rusage) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, rusage, struct_rusage_sz);
@@ -2618,7 +2622,7 @@ INTERCEPTOR(int, wait4, int pid, int *status, int options, void *rusage) {
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
-  int res = REAL(wait4)(pid, status, options, rusage);
+  int res = COMMON_INTERCEPTOR_BLOCK_REAL(wait4)(pid, status, options, rusage);
   if (res != -1) {
     if (status) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, status, sizeof(*status));
     if (rusage) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, rusage, struct_rusage_sz);
@@ -2996,7 +3000,7 @@ INTERCEPTOR(int, accept, int fd, void *addr, unsigned *addrlen) {
     COMMON_INTERCEPTOR_READ_RANGE(ctx, addrlen, sizeof(*addrlen));
     addrlen0 = *addrlen;
   }
-  int fd2 = REAL(accept)(fd, addr, addrlen);
+  int fd2 = COMMON_INTERCEPTOR_BLOCK_REAL(accept)(fd, addr, addrlen);
   if (fd2 >= 0) {
     if (fd >= 0) COMMON_INTERCEPTOR_FD_SOCKET_ACCEPT(ctx, fd, fd2);
     if (addr && addrlen)
@@ -3021,7 +3025,7 @@ INTERCEPTOR(int, accept4, int fd, void *addr, unsigned *addrlen, int f) {
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
-  int fd2 = REAL(accept4)(fd, addr, addrlen, f);
+  int fd2 = COMMON_INTERCEPTOR_BLOCK_REAL(accept4)(fd, addr, addrlen, f);
   if (fd2 >= 0) {
     if (fd >= 0) COMMON_INTERCEPTOR_FD_SOCKET_ACCEPT(ctx, fd, fd2);
     if (addr && addrlen)
@@ -3045,7 +3049,7 @@ INTERCEPTOR(int, paccept, int fd, void *addr, unsigned *addrlen,
     addrlen0 = *addrlen;
   }
   if (set) COMMON_INTERCEPTOR_READ_RANGE(ctx, set, sizeof(*set));
-  int fd2 = REAL(paccept)(fd, addr, addrlen, set, f);
+  int fd2 = COMMON_INTERCEPTOR_BLOCK_REAL(paccept)(fd, addr, addrlen, set, f);
   if (fd2 >= 0) {
     if (fd >= 0) COMMON_INTERCEPTOR_FD_SOCKET_ACCEPT(ctx, fd, fd2);
     if (addr && addrlen)
@@ -3126,7 +3130,7 @@ INTERCEPTOR(SSIZE_T, recvmsg, int fd, struct __sanitizer_msghdr *msg,
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
-  SSIZE_T res = REAL(recvmsg)(fd, msg, flags);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(recvmsg)(fd, msg, flags);
   if (res >= 0) {
     if (fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
     if (msg) {
@@ -3147,7 +3151,8 @@ INTERCEPTOR(int, recvmmsg, int fd, struct __sanitizer_mmsghdr *msgvec,
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, recvmmsg, fd, msgvec, vlen, flags, timeout);
   if (timeout) COMMON_INTERCEPTOR_READ_RANGE(ctx, timeout, struct_timespec_sz);
-  int res = REAL(recvmmsg)(fd, msgvec, vlen, flags, timeout);
+  int res =
+      COMMON_INTERCEPTOR_BLOCK_REAL(recvmmsg)(fd, msgvec, vlen, flags, timeout);
   if (res >= 0) {
     if (fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
     for (int i = 0; i < res; ++i) {
@@ -3225,7 +3230,7 @@ INTERCEPTOR(SSIZE_T, sendmsg, int fd, struct __sanitizer_msghdr *msg,
     COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
     COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
   }
-  SSIZE_T res = REAL(sendmsg)(fd, msg, flags);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(sendmsg)(fd, msg, flags);
   if (common_flags()->intercept_send && res >= 0 && msg)
     read_msghdr(ctx, msg, res);
   return res;
@@ -3244,7 +3249,7 @@ INTERCEPTOR(int, sendmmsg, int fd, struct __sanitizer_mmsghdr *msgvec,
     COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
     COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
   }
-  int res = REAL(sendmmsg)(fd, msgvec, vlen, flags);
+  int res = COMMON_INTERCEPTOR_BLOCK_REAL(sendmmsg)(fd, msgvec, vlen, flags);
   if (res >= 0 && msgvec) {
     for (int i = 0; i < res; ++i) {
       COMMON_INTERCEPTOR_WRITE_RANGE(ctx, &msgvec[i].msg_len,
@@ -3267,7 +3272,7 @@ INTERCEPTOR(int, msgsnd, int msqid, const void *msgp, SIZE_T msgsz,
   COMMON_INTERCEPTOR_ENTER(ctx, msgsnd, msqid, msgp, msgsz, msgflg);
   if (msgp)
     COMMON_INTERCEPTOR_READ_RANGE(ctx, msgp, sizeof(long) + msgsz);
-  int res = REAL(msgsnd)(msqid, msgp, msgsz, msgflg);
+  int res = COMMON_INTERCEPTOR_BLOCK_REAL(msgsnd)(msqid, msgp, msgsz, msgflg);
   return res;
 }
 
@@ -3275,7 +3280,8 @@ INTERCEPTOR(SSIZE_T, msgrcv, int msqid, void *msgp, SIZE_T msgsz,
             long msgtyp, int msgflg) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, msgrcv, msqid, msgp, msgsz, msgtyp, msgflg);
-  SSIZE_T len = REAL(msgrcv)(msqid, msgp, msgsz, msgtyp, msgflg);
+  SSIZE_T len =
+      COMMON_INTERCEPTOR_BLOCK_REAL(msgrcv)(msqid, msgp, msgsz, msgtyp, msgflg);
   if (len != -1)
     COMMON_INTERCEPTOR_WRITE_RANGE(ctx, msgp, sizeof(long) + len);
   return len;
@@ -6119,7 +6125,7 @@ INTERCEPTOR(int, flopen, const char *path, int flags, ...) {
   if (path) {
     COMMON_INTERCEPTOR_READ_RANGE(ctx, path, internal_strlen(path) + 1);
   }
-  return REAL(flopen)(path, flags, mode);
+  return COMMON_INTERCEPTOR_BLOCK_REAL(flopen)(path, flags, mode);
 }
 
 INTERCEPTOR(int, flopenat, int dirfd, const char *path, int flags, ...) {
@@ -6132,7 +6138,7 @@ INTERCEPTOR(int, flopenat, int dirfd, const char *path, int flags, ...) {
   if (path) {
     COMMON_INTERCEPTOR_READ_RANGE(ctx, path, internal_strlen(path) + 1);
   }
-  return REAL(flopenat)(dirfd, path, flags, mode);
+  return COMMON_INTERCEPTOR_BLOCK_REAL(flopenat)(dirfd, path, flags, mode);
 }
 
 #define INIT_FLOPEN    \
@@ -6717,7 +6723,7 @@ INTERCEPTOR(SSIZE_T, recv, int fd, void *buf, SIZE_T len, int flags) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, recv, fd, buf, len, flags);
   COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
-  SSIZE_T res = REAL(recv)(fd, buf, len, flags);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(recv)(fd, buf, len, flags);
   if (res > 0) {
     COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, Min((SIZE_T)res, len));
   }
@@ -6734,7 +6740,8 @@ INTERCEPTOR(SSIZE_T, recvfrom, int fd, void *buf, SIZE_T len, int flags,
   SIZE_T srcaddr_sz;
   if (srcaddr) srcaddr_sz = *addrlen;
   (void)srcaddr_sz;  // prevent "set but not used" warning
-  SSIZE_T res = REAL(recvfrom)(fd, buf, len, flags, srcaddr, addrlen);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(recvfrom)(fd, buf, len, flags,
+                                                        srcaddr, addrlen);
   if (res > 0)
     COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, Min((SIZE_T)res, len));
   if (res >= 0 && srcaddr)
@@ -6757,7 +6764,7 @@ INTERCEPTOR(SSIZE_T, send, int fd, void *buf, SIZE_T len, int flags) {
     COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
     COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
   }
-  SSIZE_T res = REAL(send)(fd, buf, len, flags);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(send)(fd, buf, len, flags);
   if (common_flags()->intercept_send && res > 0)
     COMMON_INTERCEPTOR_READ_RANGE(ctx, buf, Min((SIZE_T)res, len));
   return res;
@@ -6772,7 +6779,8 @@ INTERCEPTOR(SSIZE_T, sendto, int fd, void *buf, SIZE_T len, int flags,
     COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
   }
   // Can't check dstaddr as it may have uninitialized padding at the end.
-  SSIZE_T res = REAL(sendto)(fd, buf, len, flags, dstaddr, addrlen);
+  SSIZE_T res = COMMON_INTERCEPTOR_BLOCK_REAL(sendto)(fd, buf, len, flags,
+                                                      dstaddr, addrlen);
   if (common_flags()->intercept_send && res > 0)
     COMMON_INTERCEPTOR_READ_RANGE(ctx, buf, Min((SIZE_T)res, len));
   return res;
@@ -6789,7 +6797,7 @@ INTERCEPTOR(int, eventfd_read, int fd, __sanitizer_eventfd_t *value) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, eventfd_read, fd, value);
   COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
-  int res = REAL(eventfd_read)(fd, value);
+  int res = COMMON_INTERCEPTOR_BLOCK_REAL(eventfd_read)(fd, value);
   if (res == 0) {
     COMMON_INTERCEPTOR_WRITE_RANGE(ctx, value, sizeof(*value));
     if (fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
@@ -6803,7 +6811,7 @@ INTERCEPTOR(int, eventfd_write, int fd, __sanitizer_eventfd_t value) {
     COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
     COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
   }
-  int res = REAL(eventfd_write)(fd, value);
+  int res = COMMON_INTERCEPTOR_BLOCK_REAL(eventfd_write)(fd, value);
   return res;
 }
 #define INIT_EVENTFD_READ_WRITE            \
@@ -7426,7 +7434,8 @@ INTERCEPTOR(int, open_by_handle_at, int mount_fd, struct file_handle* handle,
   COMMON_INTERCEPTOR_READ_RANGE(
       ctx, &sanitizer_handle->f_handle, sanitizer_handle->handle_bytes);
 
-  return REAL(open_by_handle_at)(mount_fd, handle, flags);
+  return COMMON_INTERCEPTOR_BLOCK_REAL(open_by_handle_at)(mount_fd, handle,
+                                                          flags);
 }
 
 #define INIT_OPEN_BY_HANDLE_AT COMMON_INTERCEPT_FUNCTION(open_by_handle_at)
diff --git a/compiler-rt/test/tsan/pthread_atfork_deadlock3.c b/compiler-rt/test/tsan/pthread_atfork_deadlock3.c
index 793eaf6ac86747..41b8f051b33c12 100644
--- a/compiler-rt/test/tsan/pthread_atfork_deadlock3.c
+++ b/compiler-rt/test/tsan/pthread_atfork_deadlock3.c
@@ -28,17 +28,17 @@ void *worker(void *main) {
 }
 
 void atfork() {
+  write(2, "in atfork\n", strlen("in atfork\n"));
   barrier_wait(&barrier);
   barrier_wait(&barrier);
-  write(2, "in atfork\n", strlen("in atfork\n"));
   static volatile long a;
   __atomic_fetch_add(&a, 1, __ATOMIC_RELEASE);
 }
 
 void afterfork() {
+  write(2, "in afterfork\n", strlen("in afterfork\n"));
   barrier_wait(&barrier);
   barrier_wait(&barrier);
-  write(2, "in afterfork\n", strlen("in afterfork\n"));
   static volatile long a;
   __atomic_fetch_add(&a, 1, __ATOMIC_RELEASE);
 }
diff --git a/compiler-rt/test/tsan/signal_in_read.c b/compiler-rt/test/tsan/signal_in_read.c
new file mode 100644
index 00000000000000..ec50d9d0217456
--- /dev/null
+++ b/compiler-rt/test/tsan/signal_in_read.c
@@ -0,0 +1,59 @@
+// RUN: %clang_tsan -O1 %s -o %t && %run %t 2>&1 | FileCheck %s
+
+#include "test.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+static int SignalPipeFd[] = {-1, -1};
+static int BlockingPipeFd[] = {-1, -1};
+
+static void Handler(int _) { assert(write(SignalPipeFd[1], ".", 1) == 1); }
+
+static void *ThreadFunc(void *_) {
+  char C;
+  assert(read(BlockingPipeFd[0], &C, sizeof(C)) == 1);
+  assert(C == '.');
+  return 0;
+}
+
+int main() {
+  alarm(60); // Kill the test if it hangs.
+
+  assert(pipe(SignalPipeFd) == 0);
+  assert(pipe(BlockingPipeFd) == 0);
+
+  struct sigaction act;
+  sigemptyset(&act.sa_mask);
+  act.sa_flags = SA_RESTART;
+  act.sa_handler = Handler;
+  assert(sigaction(SIGUSR1, &act, 0) == 0);
+
+  pthread_t Thr;
+  assert(pthread_create(&Thr, 0, ThreadFunc, 0) == 0);
+
+  // Give the thread enough time to block in the read call.
+  usleep(1000000);
+
+  // Signal the thread; this should run the signal handler and unblock the read
+  // below.
+  pthread_kill(Thr, SIGUSR1);
+  char C;
+  assert(read(SignalPipeFd[0], &C, 1) == 1);
+
+  // Unblock the thread and join it.
+  assert(write(BlockingPipeFd[1], &C, 1) == 1);
+  void *_ = 0;
+  assert(pthread_join(Thr, &_) == 0);
+
+  fprintf(stderr, "PASS\n");
+  return 0;
+}
+
+// CHECK-NOT: WARNING: ThreadSanitizer:
+// CHECK: PASS
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index 21617aeea0215e..71141e5efac488 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -413,6 +413,14 @@ if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
 
 endif()
 
+# Clang on Darwin enables non-POSIX extensions by default, which allows the
+# macro HUGE to leak out of <math.h> even when it is never directly included,
+# conflicting with Flang's HUGE symbols.
+# Set _POSIX_C_SOURCE to avoid including these extensions.
+if (APPLE)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_POSIX_C_SOURCE=200809")
+endif()
+
 list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
 
 # Determine HOST_LINK_VERSION on Darwin.
diff --git a/flang/include/flang/Evaluate/integer.h b/flang/include/flang/Evaluate/integer.h
index 2fce4bedfaee0f..977d35c7eecf48 100644
--- a/flang/include/flang/Evaluate/integer.h
+++ b/flang/include/flang/Evaluate/integer.h
@@ -27,10 +27,6 @@
 #include <string>
 #include <type_traits>
 
-// Some environments, viz. clang on Darwin, allow the macro HUGE
-// to leak out of <math.h> even when it is never directly included.
-#undef HUGE
-
 namespace Fortran::evaluate::value {
 
 // Implements an integer as an assembly of smaller host integer parts
diff --git a/flang/include/flang/Evaluate/real.h b/flang/include/flang/Evaluate/real.h
index 62c99cebc31684..5266bd0ef64bfd 100644
--- a/flang/include/flang/Evaluate/real.h
+++ b/flang/include/flang/Evaluate/real.h
@@ -18,10 +18,6 @@
 #include <limits>
 #include <string>
 
-// Some environments, viz. clang on Darwin, allow the macro HUGE
-// to leak out of <math.h> even when it is never directly included.
-#undef HUGE
-
 namespace llvm {
 class raw_ostream;
 }
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 170e134baef614..ce87941d5382ca 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -230,7 +230,8 @@ translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
 /// on the IR.
 fir::ExtendedValue
 translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
-                         fir::FortranVariableOpInterface fortranVariable);
+                         fir::FortranVariableOpInterface fortranVariable,
+                         bool forceHlfirBase = false);
 
 /// Generate declaration for a fir::ExtendedValue in memory.
 fir::FortranVariableOpInterface
diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h
index 798bc5f37f6f42..6b3c9416724cb6 100644
--- a/flang/lib/Evaluate/fold-implementation.h
+++ b/flang/lib/Evaluate/fold-implementation.h
@@ -39,10 +39,6 @@
 #include <type_traits>
 #include <variant>
 
-// Some environments, viz. clang on Darwin, allow the macro HUGE
-// to leak out of <math.h> even when it is never directly included.
-#undef HUGE
-
 namespace Fortran::evaluate {
 
 // Utilities
diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp
index e68c5ed3f6a8b1..7315a7a057b102 100644
--- a/flang/lib/Evaluate/intrinsics-library.cpp
+++ b/flang/lib/Evaluate/intrinsics-library.cpp
@@ -299,8 +299,8 @@ struct HostRuntimeLibrary<std::complex<HostT>, LibraryVersion::Libm> {
 /// Define libm extensions
 /// Bessel functions are defined in POSIX.1-2001.
 
-// Remove float bessel functions for AIX as they are not supported
-#ifndef _AIX
+// Remove float Bessel functions for AIX and Darwin as they are not supported
+#if !defined(_AIX) && !defined(__APPLE__)
 template <> struct HostRuntimeLibrary<float, LibraryVersion::LibmExtensions> {
   using F = FuncPointer<float, float>;
   using FN = FuncPointer<float, int, float>;
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 8048693119b4c5..a668ba4116faab 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -618,7 +618,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     assert(details && "No host-association found");
     const Fortran::semantics::Symbol &hsym = details->symbol();
     mlir::Type hSymType = genType(hsym);
-    Fortran::lower::SymbolBox hsb = lookupSymbol(hsym);
+    Fortran::lower::SymbolBox hsb =
+        lookupSymbol(hsym, /*symMap=*/nullptr, /*forceHlfirBase=*/true);
 
     auto allocate = [&](llvm::ArrayRef<mlir::Value> shape,
                         llvm::ArrayRef<mlir::Value> typeParams) -> mlir::Value {
@@ -727,7 +728,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   void createHostAssociateVarCloneDealloc(
       const Fortran::semantics::Symbol &sym) override final {
     mlir::Location loc = genLocation(sym.name());
-    Fortran::lower::SymbolBox hsb = lookupSymbol(sym);
+    Fortran::lower::SymbolBox hsb =
+        lookupSymbol(sym, /*symMap=*/nullptr, /*forceHlfirBase=*/true);
 
     fir::ExtendedValue hexv = symBoxToExtendedValue(hsb);
     hexv.match(
@@ -960,13 +962,14 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   /// Find the symbol in the local map or return null.
   Fortran::lower::SymbolBox
   lookupSymbol(const Fortran::semantics::Symbol &sym,
-               Fortran::lower::SymMap *symMap = nullptr) {
+               Fortran::lower::SymMap *symMap = nullptr,
+               bool forceHlfirBase = false) {
     symMap = symMap ? symMap : &localSymbols;
     if (lowerToHighLevelFIR()) {
       if (std::optional<fir::FortranVariableOpInterface> var =
               symMap->lookupVariableDefinition(sym)) {
-        auto exv =
-            hlfir::translateToExtendedValue(toLocation(), *builder, *var);
+        auto exv = hlfir::translateToExtendedValue(toLocation(), *builder, *var,
+                                                   forceHlfirBase);
         return exv.match(
             [](mlir::Value x) -> Fortran::lower::SymbolBox {
               return Fortran::lower::SymbolBox::Intrinsic{x};
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 4ffa303f27103a..0e0b14e8d69094 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -848,36 +848,38 @@ hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc,
 
 static fir::ExtendedValue
 translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
-                                 hlfir::Entity variable) {
+                                 hlfir::Entity variable,
+                                 bool forceHlfirBase = false) {
   assert(variable.isVariable() && "must be a variable");
   /// When going towards FIR, use the original base value to avoid
   /// introducing descriptors at runtime when they are not required.
-  mlir::Value firBase = variable.getFirBase();
+  mlir::Value base =
+      forceHlfirBase ? variable.getBase() : variable.getFirBase();
   if (variable.isMutableBox())
-    return fir::MutableBoxValue(firBase, getExplicitTypeParams(variable),
+    return fir::MutableBoxValue(base, getExplicitTypeParams(variable),
                                 fir::MutableProperties{});
 
-  if (firBase.getType().isa<fir::BaseBoxType>()) {
+  if (base.getType().isa<fir::BaseBoxType>()) {
     if (!variable.isSimplyContiguous() || variable.isPolymorphic() ||
         variable.isDerivedWithLengthParameters() || variable.isOptional()) {
       llvm::SmallVector<mlir::Value> nonDefaultLbounds =
           getNonDefaultLowerBounds(loc, builder, variable);
-      return fir::BoxValue(firBase, nonDefaultLbounds,
+      return fir::BoxValue(base, nonDefaultLbounds,
                            getExplicitTypeParams(variable));
     }
     // Otherwise, the variable can be represented in a fir::ExtendedValue
     // without the overhead of a fir.box.
-    firBase = genVariableRawAddress(loc, builder, variable);
+    base = genVariableRawAddress(loc, builder, variable);
   }
 
   if (variable.isScalar()) {
     if (variable.isCharacter()) {
-      if (firBase.getType().isa<fir::BoxCharType>())
-        return genUnboxChar(loc, builder, firBase);
+      if (base.getType().isa<fir::BoxCharType>())
+        return genUnboxChar(loc, builder, base);
       mlir::Value len = genCharacterVariableLength(loc, builder, variable);
-      return fir::CharBoxValue{firBase, len};
+      return fir::CharBoxValue{base, len};
     }
-    return firBase;
+    return base;
   }
   llvm::SmallVector<mlir::Value> extents;
   llvm::SmallVector<mlir::Value> nonDefaultLbounds;
@@ -893,15 +895,16 @@ translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
   }
   if (variable.isCharacter())
     return fir::CharArrayBoxValue{
-        firBase, genCharacterVariableLength(loc, builder, variable), extents,
+        base, genCharacterVariableLength(loc, builder, variable), extents,
         nonDefaultLbounds};
-  return fir::ArrayBoxValue{firBase, extents, nonDefaultLbounds};
+  return fir::ArrayBoxValue{base, extents, nonDefaultLbounds};
 }
 
 fir::ExtendedValue
 hlfir::translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
-                                fir::FortranVariableOpInterface var) {
-  return translateVariableToExtendedValue(loc, builder, var);
+                                fir::FortranVariableOpInterface var,
+                                bool forceHlfirBase) {
+  return translateVariableToExtendedValue(loc, builder, var, forceHlfirBase);
 }
 
 std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>>
diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 4cf39716a73755..746c275f37eaca 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -209,6 +209,7 @@ class BoxedProcedurePass
       BoxprocTypeRewriter typeConverter(mlir::UnknownLoc::get(context));
       mlir::Dialect *firDialect = context->getLoadedDialect("fir");
       getModule().walk([&](mlir::Operation *op) {
+        bool opIsValid = true;
         typeConverter.setLocation(op->getLoc());
         if (auto addr = mlir::dyn_cast<BoxAddrOp>(op)) {
           mlir::Type ty = addr.getVal().getType();
@@ -220,6 +221,7 @@ class BoxedProcedurePass
             rewriter.setInsertionPoint(addr);
             rewriter.replaceOpWithNewOp<ConvertOp>(
                 addr, typeConverter.convertType(addr.getType()), addr.getVal());
+            opIsValid = false;
           } else if (typeConverter.needsConversion(resTy)) {
             rewriter.startOpModification(op);
             op->getResult(0).setType(typeConverter.convertType(resTy));
@@ -271,10 +273,12 @@ class BoxedProcedurePass
                 llvm::ArrayRef<mlir::Value>{tramp});
             rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy,
                                                    adjustCall.getResult(0));
+            opIsValid = false;
           } else {
             // Just forward the function as a pointer.
             rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy,
                                                    embox.getFunc());
+            opIsValid = false;
           }
         } else if (auto global = mlir::dyn_cast<GlobalOp>(op)) {
           auto ty = global.getType();
@@ -297,6 +301,7 @@ class BoxedProcedurePass
             rewriter.replaceOpWithNewOp<AllocaOp>(
                 mem, toTy, uniqName, bindcName, isPinned, mem.getTypeparams(),
                 mem.getShape());
+            opIsValid = false;
           }
         } else if (auto mem = mlir::dyn_cast<AllocMemOp>(op)) {
           auto ty = mem.getType();
@@ -310,6 +315,7 @@ class BoxedProcedurePass
             rewriter.replaceOpWithNewOp<AllocMemOp>(
                 mem, toTy, uniqName, bindcName, mem.getTypeparams(),
                 mem.getShape());
+            opIsValid = false;
           }
         } else if (auto coor = mlir::dyn_cast<CoordinateOp>(op)) {
           auto ty = coor.getType();
@@ -321,6 +327,7 @@ class BoxedProcedurePass
             auto toBaseTy = typeConverter.convertType(baseTy);
             rewriter.replaceOpWithNewOp<CoordinateOp>(coor, toTy, coor.getRef(),
                                                       coor.getCoor(), toBaseTy);
+            opIsValid = false;
           }
         } else if (auto index = mlir::dyn_cast<FieldIndexOp>(op)) {
           auto ty = index.getType();
@@ -332,6 +339,7 @@ class BoxedProcedurePass
             auto toOnTy = typeConverter.convertType(onTy);
             rewriter.replaceOpWithNewOp<FieldIndexOp>(
                 index, toTy, index.getFieldId(), toOnTy, index.getTypeparams());
+            opIsValid = false;
           }
         } else if (auto index = mlir::dyn_cast<LenParamIndexOp>(op)) {
           auto ty = index.getType();
@@ -343,6 +351,7 @@ class BoxedProcedurePass
             auto toOnTy = typeConverter.convertType(onTy);
             rewriter.replaceOpWithNewOp<LenParamIndexOp>(
                 index, toTy, index.getFieldId(), toOnTy, index.getTypeparams());
+            opIsValid = false;
           }
         } else if (op->getDialect() == firDialect) {
           rewriter.startOpModification(op);
@@ -354,7 +363,7 @@ class BoxedProcedurePass
           rewriter.finalizeOpModification(op);
         }
         // Ensure block arguments are updated if needed.
-        if (op->getNumRegions() != 0) {
+        if (opIsValid && op->getNumRegions() != 0) {
           rewriter.startOpModification(op);
           for (mlir::Region &region : op->getRegions())
             for (mlir::Block &block : region.getBlocks())
diff --git a/flang/test/Lower/OpenMP/parallel-private-clause-str.f90 b/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
index f668957624b497..025e51e0661764 100644
--- a/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
+++ b/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
@@ -10,7 +10,7 @@
 !CHECK:    %[[C_DECL:.*]]:2 = hlfir.declare %[[C_BOX_REF]] typeparams %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatable_stringEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, i32) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 !CHECK:    omp.parallel {
 !CHECK:      %[[C_PVT_BOX_REF:.*]] = fir.alloca !fir.box<!fir.heap<!fir.char<1,?>>> {bindc_name = "c", pinned, uniq_name = "_QFtest_allocatable_stringEc"}
-!CHECK:      %[[C_BOX:.*]] = fir.load %[[C_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+!CHECK:      %[[C_BOX:.*]] = fir.load %[[C_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
 !CHECK:      fir.if %{{.*}} {
 !CHECK:        %[[C_PVT_MEM:.*]] = fir.allocmem !fir.char<1,?>(%{{.*}} : index) {fir.must_be_heap = true, uniq_name = "_QFtest_allocatable_stringEc.alloc"}
 !CHECK:        %[[C_PVT_BOX:.*]] = fir.embox %[[C_PVT_MEM]] typeparams %{{.*}} : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
@@ -18,7 +18,7 @@
 !CHECK:      }
 !CHECK:      %[[C_PVT_DECL:.*]]:2 = hlfir.declare %[[C_PVT_BOX_REF]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatable_stringEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 !CHECK:      fir.if %{{.*}} {
-!CHECK:        %[[C_PVT_BOX:.*]] = fir.load %[[C_PVT_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+!CHECK:        %[[C_PVT_BOX:.*]] = fir.load %[[C_PVT_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
 !CHECK:        %[[C_PVT_BOX_ADDR:.*]] = fir.box_addr %[[C_PVT_BOX]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
 !CHECK:        fir.freemem %[[C_PVT_BOX_ADDR]] : !fir.heap<!fir.char<1,?>>
 !CHECK:      }
@@ -38,16 +38,16 @@ subroutine test_allocatable_string(n)
 !CHECK:    %[[C_DECL:.*]]:2 = hlfir.declare %[[C_BOX_REF]] typeparams %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatable_string_arrayEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, i32) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
 !CHECK:    omp.parallel {
 !CHECK:      %[[C_PVT_BOX_REF:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>> {bindc_name = "c", pinned, uniq_name = "_QFtest_allocatable_string_arrayEc"}
-!CHECK:      %{{.*}} = fir.load %[[C_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
+!CHECK:      %{{.*}} = fir.load %[[C_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
 !CHECK:      fir.if %{{.*}} {
 !CHECK:        %[[C_PVT_ALLOC:.*]] = fir.allocmem !fir.array<?x!fir.char<1,?>>(%{{.*}} : index), %{{.*}} {fir.must_be_heap = true, uniq_name = "_QFtest_allocatable_string_arrayEc.alloc"}
 !CHECK:        %[[C_PVT_BOX:.*]] = fir.embox %[[C_PVT_ALLOC]](%{{.*}}) typeparams %{{.*}} : (!fir.heap<!fir.array<?x!fir.char<1,?>>>, !fir.shapeshift<1>, index) -> !fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>
 !CHECK:        fir.store %[[C_PVT_BOX]] to %[[C_PVT_BOX_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
 !CHECK:      }
 !CHECK:      %[[C_PVT_DECL:.*]]:2 = hlfir.declare %[[C_PVT_BOX_REF]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatable_string_arrayEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
-!CHECK:      %{{.*}} = fir.load %[[C_PVT_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
+!CHECK:      %{{.*}} = fir.load %[[C_PVT_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
 !CHECK:      fir.if %{{.*}} {
-!CHECK:        %[[C_PVT_BOX:.*]] = fir.load %[[C_PVT_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
+!CHECK:        %[[C_PVT_BOX:.*]] = fir.load %[[C_PVT_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
 !CHECK:        %[[C_PVT_ADDR:.*]] = fir.box_addr %[[C_PVT_BOX]] : (!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>) -> !fir.heap<!fir.array<?x!fir.char<1,?>>>
 !CHECK:        fir.freemem %[[C_PVT_ADDR]] : !fir.heap<!fir.array<?x!fir.char<1,?>>>
 !CHECK:      }
diff --git a/flang/test/Lower/OpenMP/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/parallel-private-clause.f90
index 3e46d315f8cc47..5578b6710da7cd 100644
--- a/flang/test/Lower/OpenMP/parallel-private-clause.f90
+++ b/flang/test/Lower/OpenMP/parallel-private-clause.f90
@@ -150,8 +150,8 @@ subroutine private_clause_derived_type()
 !FIRDialect-DAG:    %[[X4_PVT:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "x4", pinned, uniq_name = "{{.*}}Ex4"}
 !FIRDialect-DAG:    %[[X4_PVT_DECL:.*]]:2 = hlfir.declare %[[X4_PVT]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "{{.*}}Ex4"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 
-!FIRDialect-DAG:    %[[TMP58:.*]] = fir.load %[[X4_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-!FIRDialect-DAG:    %[[TMP97:.*]] = fir.load %[[X4_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+!FIRDialect-DAG:    %[[TMP58:.*]] = fir.load %[[X4_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+!FIRDialect-DAG:    %[[TMP97:.*]] = fir.load %[[X4_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 !FIRDialect-DAG:    %[[TMP98:.*]]:3 = fir.box_dims %[[TMP97]], {{.*}} : (!fir.box<!fir.heap<!fir.array<?xi32>>>, index) -> (index, index, index)
 
 !FIRDialect-DAG:    %[[TMP101:.*]] = fir.allocmem !fir.array<?xi32>, {{.*}} {fir.must_be_heap = true, uniq_name = "{{.*}}Ex4.alloc"}
@@ -192,12 +192,12 @@ subroutine private_clause_allocatable()
 !FIRDialect-DAG: }
 !FIRDialect-DAG: %[[X5_PVT_DECL:.*]]:2 = hlfir.declare %[[X5_PVT]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFprivate_clause_real_call_allocatableEx5"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
 !FIRDialect-DAG: fir.call @_QFprivate_clause_real_call_allocatablePhelper_private_clause_real_call_allocatable(%[[X5_PVT_DECL]]#0) fastmath<contract> : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> ()
-!FIRDialect-DAG: %{{.*}} = fir.load %[[X5_PVT_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+!FIRDialect-DAG: %{{.*}} = fir.load %[[X5_PVT_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
 
 !FIRDialect-DAG: fir.if %{{.*}} {
-!FIRDialect-DAG:   %{{.*}} = fir.load %[[X5_PVT_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+!FIRDialect-DAG:   %{{.*}} = fir.load %[[X5_PVT_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
 
-!FIRDialect-DAG:     fir.store %{{.*}} to %[[X5_PVT_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+!FIRDialect-DAG:     fir.store %{{.*}} to %[[X5_PVT_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
 !FIRDialect-DAG:   }
 !FIRDialect-DAG:   omp.terminator
 !FIRDialect-DAG:   }
@@ -313,12 +313,12 @@ subroutine simple_loop_1
     print*, i
   end do
   ! FIRDialect:     omp.yield
-  ! FIRDialect:     {{%.*}} = fir.load %[[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     {{%.*}} = fir.load %[[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
   ! FIRDialect:     fir.if {{%.*}} {
-  ! FIRDialect:     [[LD:%.*]] = fir.load %[[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     [[LD:%.*]] = fir.load %[[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
   ! FIRDialect:     [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
   ! FIRDialect:     fir.freemem [[AD]] : !fir.heap<f32>
-  ! FIRDialect:     fir.store {{%.*}} to %[[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     fir.store {{%.*}} to %[[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
   !$OMP END DO
   ! FIRDialect:  omp.terminator
   !$OMP END PARALLEL
@@ -351,12 +351,12 @@ subroutine simple_loop_2
     print*, i
   end do
   ! FIRDialect:     omp.yield
-  ! FIRDialect:     {{%.*}} = fir.load %[[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     {{%.*}} = fir.load %[[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
   ! FIRDialect:     fir.if {{%.*}} {
-  ! FIRDialect:     [[LD:%.*]] = fir.load %[[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     [[LD:%.*]] = fir.load %[[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
   ! FIRDialect:     [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
   ! FIRDialect:     fir.freemem [[AD]] : !fir.heap<f32>
-  ! FIRDialect:     fir.store {{%.*}} to %[[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     fir.store {{%.*}} to %[[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
   !$OMP END DO
   ! FIRDialect:  omp.terminator
   !$OMP END PARALLEL
@@ -388,12 +388,12 @@ subroutine simple_loop_3
     print*, i
   end do
   ! FIRDialect:     omp.yield
-  ! FIRDialect:     {{%.*}} = fir.load [[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     {{%.*}} = fir.load [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
   ! FIRDialect:     fir.if {{%.*}} {
-  ! FIRDialect:     [[LD:%.*]] = fir.load [[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     [[LD:%.*]] = fir.load [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
   ! FIRDialect:     [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
   ! FIRDialect:     fir.freemem [[AD]] : !fir.heap<f32>
-  ! FIRDialect:     fir.store {{%.*}} to [[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     fir.store {{%.*}} to [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
   !$OMP END PARALLEL DO
   ! FIRDialect:  omp.terminator
 end subroutine
@@ -421,10 +421,10 @@ subroutine simd_loop_1
   end do
   !$OMP END SIMD
   ! FIRDialect:     omp.yield
-  ! FIRDialect:     {{%.*}} = fir.load [[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     {{%.*}} = fir.load [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
   ! FIRDialect:     fir.if {{%.*}} {
-  ! FIRDialect:     [[LD:%.*]] = fir.load [[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     [[LD:%.*]] = fir.load [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
   ! FIRDialect:     [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
   ! FIRDialect:     fir.freemem [[AD]] : !fir.heap<f32>
-  ! FIRDialect:     fir.store {{%.*}} to [[R_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     fir.store {{%.*}} to [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
 end subroutine
diff --git a/flang/unittests/Runtime/Ragged.cpp b/flang/unittests/Runtime/Ragged.cpp
index 4b261b14789c47..5049bc83405f17 100644
--- a/flang/unittests/Runtime/Ragged.cpp
+++ b/flang/unittests/Runtime/Ragged.cpp
@@ -14,7 +14,7 @@ using namespace Fortran::runtime;
 TEST(Ragged, RaggedArrayAllocateDeallocateTest) {
   struct RaggedArrayHeader header;
   unsigned rank = 2;
-  int64_t *extents = new int64_t[2];
+  int64_t *extents = reinterpret_cast<int64_t *>(malloc(2 * sizeof(int64_t)));
   extents[0] = 10;
   extents[1] = 100;
   RaggedArrayHeader *ret = (RaggedArrayHeader *)_FortranARaggedArrayAllocate(
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index b4a2523b778877..6edf5c656193db 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -60,6 +60,10 @@ if(LLVM_LIBC_FULL_BUILD OR LLVM_LIBC_GPU_BUILD)
     message(STATUS "Will use ${LIBC_HDRGEN_EXE} for libc header generation.")
   endif()
 endif()
+# We will build the GPU utilities if we are not doing a runtimes build.
+if(LLVM_LIBC_GPU_BUILD AND NOT LLVM_RUNTIMES_BUILD)
+  add_subdirectory(utils/gpu)
+endif()
 
 set(NEED_LIBC_HDRGEN FALSE)
 if(NOT LLVM_RUNTIMES_BUILD)
@@ -79,11 +83,6 @@ if(LIBC_HDRGEN_ONLY OR NEED_LIBC_HDRGEN)
   # When libc is build as part of the runtimes/bootstrap build's CMake run, we
   # only need to build the host tools to build the libc. So, we just do enough
   # to build libc-hdrgen and return.
-
-  # Always make the RPC server availible to other projects for GPU mode.
-  if(LLVM_LIBC_GPU_BUILD)
-    add_subdirectory(utils/gpu/server)
-  endif()
   return()
 endif()
 unset(NEED_LIBC_HDRGEN)
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index 2de4cb8d82b28b..bea6bb016491b6 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -93,6 +93,41 @@ else()
 endif()
 set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}")
 
+# Identify the GPU loader utility used to run tests.
+set(LIBC_GPU_LOADER_EXECUTABLE "" CACHE STRING "Executable for the GPU loader.")
+if(LIBC_GPU_LOADER_EXECUTABLE)
+  set(gpu_loader_executable ${LIBC_GPU_LOADER_EXECUTABLE})
+elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+  find_program(LIBC_AMDHSA_LOADER_EXECUTABLE
+               NAMES amdhsa-loader NO_DEFAULT_PATH
+               PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path})
+  if(LIBC_AMDHSA_LOADER_EXECUTABLE)
+    set(gpu_loader_executable ${LIBC_AMDHSA_LOADER_EXECUTABLE})
+  endif()
+elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+  find_program(LIBC_NVPTX_LOADER_EXECUTABLE
+               NAMES nvptx-loader NO_DEFAULT_PATH
+               PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path})
+  if(LIBC_NVPTX_LOADER_EXECUTABLE)
+    set(gpu_loader_executable ${LIBC_NVPTX_LOADER_EXECUTABLE})
+  endif()
+endif()
+if(NOT TARGET libc.utils.gpu.loader AND gpu_loader_executable)
+  add_custom_target(libc.utils.gpu.loader)
+  set_target_properties(
+    libc.utils.gpu.loader
+    PROPERTIES
+      EXECUTABLE "${gpu_loader_executable}"
+  )
+endif()
+
+if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+  # The AMDGPU environment uses different code objects to encode the ABI for
+  # kernel calls and intrinsic functions. We want to specify this manually to
+  # conform to whatever the test suite was built to handle.
+  set(LIBC_GPU_CODE_OBJECT_VERSION 5)
+endif()
+
 if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   # FIXME: This is a hack required to keep the CUDA package from trying to find
   #        pthreads. We only link the CUDA driver, so this is unneeded.
@@ -103,10 +138,3 @@ if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
     get_filename_component(LIBC_CUDA_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE)
   endif()
 endif()
-
-if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
-  # The AMDGPU environment uses different code objects to encode the ABI for
-  # kernel calls and intrinsic functions. We want to specify this manually to
-  # conform to whatever the test suite was built to handle.
-  set(LIBC_GPU_CODE_OBJECT_VERSION 5)
-endif()
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 7f2a1b2f3e2824..b22ed5127c179e 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -567,13 +567,13 @@ Legends:
 Performance
 ===========
 
-* Simple performance testings are located at: `libc/test/src/math/differential_testing <https://github.com/llvm/llvm-project/tree/main/libc/test/src/math/differential_testing>`_.
+* Simple performance tests are located at: `libc/test/src/math/performance_testing <https://github.com/llvm/llvm-project/tree/main/libc/test/src/math/performance_testing>`_.
 
 * We also use the *perf* tool from the `CORE-MATH <https://core-math.gitlabpages.inria.fr/>`_
   project: `link <https://gitlab.inria.fr/core-math/core-math/-/tree/master>`_.
   The performance results from the CORE-MATH's perf tool are reported in the
   table below, using the system library as reference (such as the `GNU C library <https://www.gnu.org/software/libc/>`_
-  on Linux). Fmod performance results obtained with "differential_testing".
+  on Linux). Fmod performance results were obtained with "performance_testing".
 
 +--------------+-------------------------------+-------------------------------+-------------------------------------+----------------------------------------------------------------------+
 | <Func>       | Reciprocal throughput (clk)   | Latency (clk)                 | Testing ranges                      | Testing configuration                                                |
diff --git a/libc/src/math/amdgpu/CMakeLists.txt b/libc/src/math/amdgpu/CMakeLists.txt
index c300730208d509..93735a556a31bf 100644
--- a/libc/src/math/amdgpu/CMakeLists.txt
+++ b/libc/src/math/amdgpu/CMakeLists.txt
@@ -176,26 +176,6 @@ add_entrypoint_object(
     -O2
 )
 
-add_entrypoint_object(
-  modf
-  SRCS
-    modf.cpp
-  HDRS
-    ../modf.h
-  COMPILE_OPTIONS
-    -O2
-)
-
-add_entrypoint_object(
-  modff
-  SRCS
-    modff.cpp
-  HDRS
-    ../modff.h
-  COMPILE_OPTIONS
-    -O2
-)
-
 add_entrypoint_object(
   nearbyint
   SRCS
diff --git a/libc/src/math/amdgpu/modf.cpp b/libc/src/math/amdgpu/modf.cpp
deleted file mode 100644
index 07dbbd6059c35f..00000000000000
--- a/libc/src/math/amdgpu/modf.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===-- Implementation of the GPU modf function ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/math/modf.h"
-#include "src/__support/common.h"
-
-namespace LIBC_NAMESPACE {
-
-LLVM_LIBC_FUNCTION(double, modf, (double x, double *iptr)) {
-  return __builtin_modf(x, iptr);
-}
-
-} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/modff.cpp b/libc/src/math/amdgpu/modff.cpp
deleted file mode 100644
index ad35f9006b5122..00000000000000
--- a/libc/src/math/amdgpu/modff.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===-- Implementation of the GPU modff function --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/math/modff.h"
-#include "src/__support/common.h"
-
-namespace LIBC_NAMESPACE {
-
-LLVM_LIBC_FUNCTION(float, modff, (float x, float *iptr)) {
-  return __builtin_modff(x, iptr);
-}
-
-} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/docs/add_math_function.md b/libc/src/math/docs/add_math_function.md
index 6f08bf037c578e..f8bc8a3bdd8b1d 100644
--- a/libc/src/math/docs/add_math_function.md
+++ b/libc/src/math/docs/add_math_function.md
@@ -129,11 +129,11 @@ implementation (which is very often glibc).
 
 - Add a performance test to:
 ```
-  libc/test/src/math/differential_testing/<func>_perf.cpp
+  libc/test/src/math/performance_testing/<func>_perf.cpp
 ```
 - Add the corresponding entry point to:
 ```
-  libc/test/src/math/differential_testing/CMakeLists.txt
+  libc/test/src/math/performance_testing/CMakeLists.txt
 ```
 
 ## Build and Run
@@ -189,8 +189,8 @@ implementation (which is very often glibc).
 
 - Build and Run performance test:
 ```
-  $ ninja libc.test.src.math.differential_testing.<func>_perf
-  $ projects/libc/test/src/math/differential_testing/libc.test.src.math.differential_testing.<func>_perf
+  $ ninja libc.test.src.math.performance_testing.<func>_perf
+  $ projects/libc/test/src/math/performance_testing/libc.test.src.math.performance_testing.<func>_perf
   $ cat <func>_perf.log
 ```
 
diff --git a/libc/src/math/nvptx/CMakeLists.txt b/libc/src/math/nvptx/CMakeLists.txt
index 56bff1472f134f..581e1c6a3044b2 100644
--- a/libc/src/math/nvptx/CMakeLists.txt
+++ b/libc/src/math/nvptx/CMakeLists.txt
@@ -177,26 +177,6 @@ add_entrypoint_object(
     -O2
 )
 
-add_entrypoint_object(
-  modf
-  SRCS
-    modf.cpp
-  HDRS
-    ../modf.h
-  COMPILE_OPTIONS
-    -O2
-)
-
-add_entrypoint_object(
-  modff
-  SRCS
-    modff.cpp
-  HDRS
-    ../modff.h
-  COMPILE_OPTIONS
-    -O2
-)
-
 add_entrypoint_object(
   nearbyint
   SRCS
diff --git a/libc/src/math/nvptx/modf.cpp b/libc/src/math/nvptx/modf.cpp
deleted file mode 100644
index 07dbbd6059c35f..00000000000000
--- a/libc/src/math/nvptx/modf.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===-- Implementation of the GPU modf function ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/math/modf.h"
-#include "src/__support/common.h"
-
-namespace LIBC_NAMESPACE {
-
-LLVM_LIBC_FUNCTION(double, modf, (double x, double *iptr)) {
-  return __builtin_modf(x, iptr);
-}
-
-} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/modff.cpp b/libc/src/math/nvptx/modff.cpp
deleted file mode 100644
index ad35f9006b5122..00000000000000
--- a/libc/src/math/nvptx/modff.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===-- Implementation of the GPU modff function --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/math/modff.h"
-#include "src/__support/common.h"
-
-namespace LIBC_NAMESPACE {
-
-LLVM_LIBC_FUNCTION(float, modff, (float x, float *iptr)) {
-  return __builtin_modff(x, iptr);
-}
-
-} // namespace LIBC_NAMESPACE
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index ad7dfdb3dfd9ec..b8a4aafcd97aa2 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1721,5 +1721,5 @@ add_subdirectory(smoke)
 
 if(NOT LLVM_LIBC_FULL_BUILD)
   add_subdirectory(exhaustive)
-  add_subdirectory(differential_testing)
+  add_subdirectory(performance_testing)
 endif()
diff --git a/libc/test/src/math/differential_testing/ceilf_diff.cpp b/libc/test/src/math/differential_testing/ceilf_diff.cpp
deleted file mode 100644
index 7c0bb1e95a03fd..00000000000000
--- a/libc/test/src/math/differential_testing/ceilf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for ceilf----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/ceilf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::ceilf, ::ceilf,
-                                "ceilf_diff.log")
diff --git a/libc/test/src/math/differential_testing/cosf_diff.cpp b/libc/test/src/math/differential_testing/cosf_diff.cpp
deleted file mode 100644
index ee3102384a8e6b..00000000000000
--- a/libc/test/src/math/differential_testing/cosf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for cosf ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/cosf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::cosf, ::cosf,
-                                "cosf_diff.log")
diff --git a/libc/test/src/math/differential_testing/exp2f_diff.cpp b/libc/test/src/math/differential_testing/exp2f_diff.cpp
deleted file mode 100644
index 545c6de320fc7c..00000000000000
--- a/libc/test/src/math/differential_testing/exp2f_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for exp2f----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/exp2f.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::exp2f, ::exp2f,
-                                "exp2f_diff.log")
diff --git a/libc/test/src/math/differential_testing/expf_diff.cpp b/libc/test/src/math/differential_testing/expf_diff.cpp
deleted file mode 100644
index 7c2e90744bc915..00000000000000
--- a/libc/test/src/math/differential_testing/expf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for expf ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/expf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::expf, ::expf,
-                                "expf_diff.log")
diff --git a/libc/test/src/math/differential_testing/expm1f_diff.cpp b/libc/test/src/math/differential_testing/expm1f_diff.cpp
deleted file mode 100644
index 3cbd8a99690fb4..00000000000000
--- a/libc/test/src/math/differential_testing/expm1f_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for expm1f --------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/expm1f.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::expm1f, ::expm1f,
-                                "expm1f_diff.log")
diff --git a/libc/test/src/math/differential_testing/fabsf_diff.cpp b/libc/test/src/math/differential_testing/fabsf_diff.cpp
deleted file mode 100644
index 9bf9eff888fb51..00000000000000
--- a/libc/test/src/math/differential_testing/fabsf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for fabsf----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/fabsf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::fabsf, ::fabsf,
-                                "fabsf_diff.log")
diff --git a/libc/test/src/math/differential_testing/floorf_diff.cpp b/libc/test/src/math/differential_testing/floorf_diff.cpp
deleted file mode 100644
index 6d72927b5010c5..00000000000000
--- a/libc/test/src/math/differential_testing/floorf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for floorf---------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/floorf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::floorf, ::floorf,
-                                "floorf_diff.log")
diff --git a/libc/test/src/math/differential_testing/fmod_diff.cpp b/libc/test/src/math/differential_testing/fmod_diff.cpp
deleted file mode 100644
index 026e529c6cae2a..00000000000000
--- a/libc/test/src/math/differential_testing/fmod_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for fmod ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "BinaryOpSingleOutputDiff.h"
-
-#include "src/math/fmod.h"
-
-#include <math.h>
-
-BINARY_OP_SINGLE_OUTPUT_DIFF(double, LIBC_NAMESPACE::fmod, ::fmod,
-                             "fmod_diff.log")
diff --git a/libc/test/src/math/differential_testing/fmodf_diff.cpp b/libc/test/src/math/differential_testing/fmodf_diff.cpp
deleted file mode 100644
index 7029b1ee42cd0e..00000000000000
--- a/libc/test/src/math/differential_testing/fmodf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for fmodf ---------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "BinaryOpSingleOutputDiff.h"
-
-#include "src/math/fmodf.h"
-
-#include <math.h>
-
-BINARY_OP_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::fmodf, ::fmodf,
-                             "fmodf_diff.log")
diff --git a/libc/test/src/math/differential_testing/hypot_diff.cpp b/libc/test/src/math/differential_testing/hypot_diff.cpp
deleted file mode 100644
index c61e589bdb2dff..00000000000000
--- a/libc/test/src/math/differential_testing/hypot_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for hypot ---------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "BinaryOpSingleOutputDiff.h"
-
-#include "src/math/hypot.h"
-
-#include <math.h>
-
-BINARY_OP_SINGLE_OUTPUT_DIFF(double, LIBC_NAMESPACE::hypot, ::hypot,
-                             "hypot_diff.log")
diff --git a/libc/test/src/math/differential_testing/hypotf_diff.cpp b/libc/test/src/math/differential_testing/hypotf_diff.cpp
deleted file mode 100644
index d1c70fc2b6edbd..00000000000000
--- a/libc/test/src/math/differential_testing/hypotf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for hypotf --------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "BinaryOpSingleOutputDiff.h"
-
-#include "src/math/hypotf.h"
-
-#include <math.h>
-
-BINARY_OP_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::hypotf, ::hypotf,
-                             "hypotf_diff.log")
diff --git a/libc/test/src/math/differential_testing/log2f_diff.cpp b/libc/test/src/math/differential_testing/log2f_diff.cpp
deleted file mode 100644
index aef431dce48701..00000000000000
--- a/libc/test/src/math/differential_testing/log2f_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for log2f ---------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/log2f.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::log2f, ::log2f,
-                                "log2f_diff.log")
diff --git a/libc/test/src/math/differential_testing/logbf_diff.cpp b/libc/test/src/math/differential_testing/logbf_diff.cpp
deleted file mode 100644
index 37441eb40a4dfa..00000000000000
--- a/libc/test/src/math/differential_testing/logbf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for logbf----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/logbf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::logbf, ::logbf,
-                                "logbf_diff.log")
diff --git a/libc/test/src/math/differential_testing/logf_diff.cpp b/libc/test/src/math/differential_testing/logf_diff.cpp
deleted file mode 100644
index 4ed1307f712081..00000000000000
--- a/libc/test/src/math/differential_testing/logf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for logf ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/logf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::logf, ::logf,
-                                "logf_diff.log")
diff --git a/libc/test/src/math/differential_testing/nearbyintf_diff.cpp b/libc/test/src/math/differential_testing/nearbyintf_diff.cpp
deleted file mode 100644
index 14200116883db4..00000000000000
--- a/libc/test/src/math/differential_testing/nearbyintf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for nearbyintf-----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/nearbyintf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::nearbyintf, ::nearbyintf,
-                                "nearbyintf_diff.log")
diff --git a/libc/test/src/math/differential_testing/rintf_diff.cpp b/libc/test/src/math/differential_testing/rintf_diff.cpp
deleted file mode 100644
index e60f66085e5d70..00000000000000
--- a/libc/test/src/math/differential_testing/rintf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for rintf----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/rintf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::rintf, ::rintf,
-                                "rintf_diff.log")
diff --git a/libc/test/src/math/differential_testing/roundf_diff.cpp b/libc/test/src/math/differential_testing/roundf_diff.cpp
deleted file mode 100644
index e1401a01af3574..00000000000000
--- a/libc/test/src/math/differential_testing/roundf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for roundf---------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/roundf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::roundf, ::roundf,
-                                "roundf_diff.log")
diff --git a/libc/test/src/math/differential_testing/sinf_diff.cpp b/libc/test/src/math/differential_testing/sinf_diff.cpp
deleted file mode 100644
index cb4557e6796b55..00000000000000
--- a/libc/test/src/math/differential_testing/sinf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for sinf ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/sinf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::sinf, ::sinf,
-                                "sinf_diff.log")
diff --git a/libc/test/src/math/differential_testing/sqrtf_diff.cpp b/libc/test/src/math/differential_testing/sqrtf_diff.cpp
deleted file mode 100644
index 22ddeaac9caf99..00000000000000
--- a/libc/test/src/math/differential_testing/sqrtf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for sqrtf----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/sqrtf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::sqrtf, ::sqrtf,
-                                "sqrtf_diff.log")
diff --git a/libc/test/src/math/differential_testing/truncf_diff.cpp b/libc/test/src/math/differential_testing/truncf_diff.cpp
deleted file mode 100644
index 7f6ac4e6a92694..00000000000000
--- a/libc/test/src/math/differential_testing/truncf_diff.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Differential test for truncf---------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SingleInputSingleOutputDiff.h"
-
-#include "src/math/truncf.h"
-
-#include <math.h>
-
-SINGLE_INPUT_SINGLE_OUTPUT_DIFF(float, LIBC_NAMESPACE::truncf, ::truncf,
-                                "truncf_diff.log")
diff --git a/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
similarity index 70%
rename from libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h
rename to libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
index 48572e78e5153e..68d37b46b77c73 100644
--- a/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h
+++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
@@ -7,14 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "test/src/math/differential_testing/Timer.h"
+#include "test/src/math/performance_testing/Timer.h"
 
 #include <fstream>
 
 namespace LIBC_NAMESPACE {
 namespace testing {
 
-template <typename T> class BinaryOpSingleOutputDiff {
+template <typename T> class BinaryOpSingleOutputPerf {
   using FPBits = fputil::FPBits<T>;
   using StorageType = typename FPBits::StorageType;
   static constexpr StorageType UIntMax =
@@ -23,40 +23,6 @@ template <typename T> class BinaryOpSingleOutputDiff {
 public:
   typedef T Func(T, T);
 
-  static uint64_t run_diff_in_range(Func myFunc, Func otherFunc,
-                                    StorageType startingBit,
-                                    StorageType endingBit, StorageType N,
-                                    std::ofstream &log) {
-    uint64_t result = 0;
-    if (endingBit < startingBit) {
-      return result;
-    }
-
-    StorageType step = (endingBit - startingBit) / N;
-    for (StorageType bitsX = startingBit, bitsY = endingBit;;
-         bitsX += step, bitsY -= step) {
-      T x = T(FPBits(bitsX));
-      T y = T(FPBits(bitsY));
-      FPBits myBits = FPBits(myFunc(x, y));
-      FPBits otherBits = FPBits(otherFunc(x, y));
-      if (myBits.uintval() != otherBits.uintval()) {
-        result++;
-        log << "       Input: " << bitsX << ", " << bitsY << " (" << x << ", "
-            << y << ")\n"
-            << "   My result: " << myBits.uintval() << " (" << myBits.get_val()
-            << ")\n"
-            << "Other result: " << otherBits.uintval() << " ("
-            << otherBits.get_val() << ")\n"
-            << '\n';
-      }
-
-      if (endingBit - bitsX < step) {
-        break;
-      }
-    }
-    return result;
-  }
-
   static void run_perf_in_range(Func myFunc, Func otherFunc,
                                 StorageType startingBit, StorageType endingBit,
                                 StorageType N, std::ofstream &log) {
@@ -69,8 +35,12 @@ template <typename T> class BinaryOpSingleOutputDiff {
       StorageType step = (endingBit - startingBit) / N;
+      // A range narrower than N (e.g. the float subnormals) yields step == 0,
+      // which would keep the exit test below from ever becoming true.
+      if (step == 0)
+        step = 1;
       for (StorageType bitsX = startingBit, bitsY = endingBit;;
            bitsX += step, bitsY -= step) {
-        T x = T(FPBits(bitsX));
-        T y = T(FPBits(bitsY));
+        T x = FPBits(bitsX).get_val();
+        T y = FPBits(bitsY).get_val();
         result = func(x, y);
         if (endingBit - bitsX < step) {
           break;
@@ -110,12 +80,12 @@ template <typename T> class BinaryOpSingleOutputDiff {
     log << " Performance tests with inputs in denormal range:\n";
     run_perf_in_range(myFunc, otherFunc, /* startingBit= */ StorageType(0),
                       /* endingBit= */ FPBits::max_subnormal().uintval(),
-                      1'000'001, log);
+                      10'000'001, log);
     log << "\n Performance tests with inputs in normal range:\n";
     run_perf_in_range(myFunc, otherFunc,
                       /* startingBit= */ FPBits::min_normal().uintval(),
                       /* endingBit= */ FPBits::max_normal().uintval(),
-                      100'000'001, log);
+                      10'000'001, log);
     log << "\n Performance tests with inputs in normal range with exponents "
            "close to each other:\n";
     run_perf_in_range(
@@ -148,16 +118,9 @@ template <typename T> class BinaryOpSingleOutputDiff {
 } // namespace testing
 } // namespace LIBC_NAMESPACE
 
-#define BINARY_OP_SINGLE_OUTPUT_DIFF(T, myFunc, otherFunc, filename)           \
-  int main() {                                                                 \
-    LIBC_NAMESPACE::testing::BinaryOpSingleOutputDiff<T>::run_diff(            \
-        &myFunc, &otherFunc, filename);                                        \
-    return 0;                                                                  \
-  }
-
 #define BINARY_OP_SINGLE_OUTPUT_PERF(T, myFunc, otherFunc, filename)           \
   int main() {                                                                 \
-    LIBC_NAMESPACE::testing::BinaryOpSingleOutputDiff<T>::run_perf(            \
+    LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf<T>::run_perf(            \
         &myFunc, &otherFunc, filename);                                        \
     return 0;                                                                  \
   }
diff --git a/libc/test/src/math/differential_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
similarity index 56%
rename from libc/test/src/math/differential_testing/CMakeLists.txt
rename to libc/test/src/math/performance_testing/CMakeLists.txt
index 878f81f1d573c8..d20c2eb303a7cc 100644
--- a/libc/test/src/math/differential_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -4,28 +4,28 @@ add_library(
   Timer.h
 )
 
-# A convenience target to build all differential tests.
-add_custom_target(libc-math-differential-tests)
+# A convenience target to build all performance tests.
+add_custom_target(libc-math-performance-tests)
 
-function(add_diff_binary target_name)
+function(add_perf_binary target_name)
   cmake_parse_arguments(
-    "DIFF"
+    "PERF"
     "" # No optional arguments
     "SUITE;CXX_STANDARD" # Single value arguments
     "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS" # Multi-value arguments
     ${ARGN}
   )
-  if(NOT DIFF_SRCS)
-    message(FATAL_ERROR "'add_diff_binary' target requires a SRCS list of .cpp "
+  if(NOT PERF_SRCS)
+    message(FATAL_ERROR "'add_perf_binary' target requires a SRCS list of .cpp "
                         "files.")
   endif()
-  if(NOT DIFF_DEPENDS)
-    message(FATAL_ERROR "'add_diff_binary' target requires a DEPENDS list of "
+  if(NOT PERF_DEPENDS)
+    message(FATAL_ERROR "'add_perf_binary' target requires a DEPENDS list of "
                         "'add_entrypoint_object' targets.")
   endif()
 
   get_fq_target_name(${target_name} fq_target_name)
-  get_fq_deps_list(fq_deps_list ${DIFF_DEPENDS})
+  get_fq_deps_list(fq_deps_list ${PERF_DEPENDS})
   get_object_files_for_test(
       link_object_files skipped_entrypoints_list ${fq_deps_list})
   if(skipped_entrypoints_list)
@@ -40,18 +40,18 @@ function(add_diff_binary target_name)
   add_executable(
     ${fq_target_name}
     EXCLUDE_FROM_ALL
-    ${DIFF_SRCS}
-    ${DIFF_HDRS}
+    ${PERF_SRCS}
+    ${PERF_HDRS}
   )
   target_include_directories(
     ${fq_target_name}
     PRIVATE
       ${LIBC_SOURCE_DIR}
   )
-  if(DIFF_COMPILE_OPTIONS)
+  if(PERF_COMPILE_OPTIONS)
     target_compile_options(
       ${fq_target_name}
-      PRIVATE ${DIFF_COMPILE_OPTIONS}
+      PRIVATE ${PERF_COMPILE_OPTIONS}
     )
   endif()
 
@@ -62,11 +62,11 @@ function(add_diff_binary target_name)
   set_target_properties(${fq_target_name}
     PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
-  if(DIFF_CXX_STANDARD)
+  if(PERF_CXX_STANDARD)
     set_target_properties(
       ${fq_target_name}
       PROPERTIES
-        CXX_STANDARD ${DIFF_CXX_STANDARD}
+        CXX_STANDARD ${PERF_CXX_STANDARD}
     )
   endif()
 
@@ -75,31 +75,22 @@ function(add_diff_binary target_name)
     libc.src.__support.FPUtil.fp_bits
     ${fq_deps_list}
   )
-  add_dependencies(libc-math-differential-tests ${fq_target_name})
+  add_dependencies(libc-math-performance-tests ${fq_target_name})
 endfunction()
 
 add_header_library(
   single_input_single_output_diff
   HDRS
-    SingleInputSingleOutputDiff.h
+    SingleInputSingleOutputPerf.h
 )
 
 add_header_library(
   binary_op_single_output_diff
   HDRS
-    BinaryOpSingleOutputDiff.h
+    BinaryOpSingleOutputPerf.h
 )
 
-add_diff_binary(
-  sinf_diff
-  SRCS
-    sinf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.sinf
-)
-
-add_diff_binary(
+add_perf_binary(
   sinf_perf
   SRCS
     sinf_perf.cpp
@@ -110,16 +101,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  cosf_diff
-  SRCS
-    cosf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.cosf
-)
-
-add_diff_binary(
+add_perf_binary(
   cosf_perf
   SRCS
     cosf_perf.cpp
@@ -130,16 +112,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  expm1f_diff
-  SRCS
-    expm1f_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.expm1f
-)
-
-add_diff_binary(
+add_perf_binary(
   expm1f_perf
   SRCS
     expm1f_perf.cpp
@@ -150,16 +123,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  ceilf_diff
-  SRCS
-    ceilf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.ceilf
-)
-
-add_diff_binary(
+add_perf_binary(
   ceilf_perf
   SRCS
     ceilf_perf.cpp
@@ -170,16 +134,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  exp2f_diff
-  SRCS
-    exp2f_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.exp2f
-)
-
-add_diff_binary(
+add_perf_binary(
   exp2f_perf
   SRCS
     exp2f_perf.cpp
@@ -190,16 +145,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  expf_diff
-  SRCS
-    expf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.expf
-)
-
-add_diff_binary(
+add_perf_binary(
   expf_perf
   SRCS
     expf_perf.cpp
@@ -210,16 +156,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  fabsf_diff
-  SRCS
-    fabsf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.fabsf
-)
-
-add_diff_binary(
+add_perf_binary(
   fabsf_perf
   SRCS
     fabsf_perf.cpp
@@ -230,16 +167,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  floorf_diff
-  SRCS
-    floorf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.floorf
-)
-
-add_diff_binary(
+add_perf_binary(
   floorf_perf
   SRCS
     floorf_perf.cpp
@@ -250,7 +178,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
+add_perf_binary(
   log10f_perf
   SRCS
     log10f_perf.cpp
@@ -261,7 +189,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
+add_perf_binary(
   log1pf_perf
   SRCS
     log1pf_perf.cpp
@@ -272,18 +200,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  log2f_diff
-  SRCS
-    log2f_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.log2f
-  COMPILE_OPTIONS
-    -fno-builtin
-)
-
-add_diff_binary(
+add_perf_binary(
   log2f_perf
   SRCS
     log2f_perf.cpp
@@ -294,18 +211,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  logf_diff
-  SRCS
-    logf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.logf
-  COMPILE_OPTIONS
-    -fno-builtin
-)
-
-add_diff_binary(
+add_perf_binary(
   logf_perf
   SRCS
     logf_perf.cpp
@@ -316,16 +222,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  logbf_diff
-  SRCS
-    logbf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.logbf
-)
-
-add_diff_binary(
+add_perf_binary(
   logbf_perf
   SRCS
     logbf_perf.cpp
@@ -336,16 +233,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  nearbyintf_diff
-  SRCS
-    nearbyintf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.nearbyintf
-)
-
-add_diff_binary(
+add_perf_binary(
   nearbyintf_perf
   SRCS
     nearbyintf_perf.cpp
@@ -356,16 +244,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  rintf_diff
-  SRCS
-    rintf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.rintf
-)
-
-add_diff_binary(
+add_perf_binary(
   rintf_perf
   SRCS
     rintf_perf.cpp
@@ -376,16 +255,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  roundf_diff
-  SRCS
-    roundf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.roundf
-)
-
-add_diff_binary(
+add_perf_binary(
   roundf_perf
   SRCS
     roundf_perf.cpp
@@ -396,16 +266,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  sqrtf_diff
-  SRCS
-    sqrtf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.sqrtf
-)
-
-add_diff_binary(
+add_perf_binary(
   sqrtf_perf
   SRCS
     sqrtf_perf.cpp
@@ -416,16 +277,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  truncf_diff
-  SRCS
-    truncf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.truncf
-)
-
-add_diff_binary(
+add_perf_binary(
   truncf_perf
   SRCS
     truncf_perf.cpp
@@ -436,18 +288,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  hypotf_diff
-  SRCS
-    hypotf_diff.cpp
-  DEPENDS
-    .binary_op_single_output_diff
-    libc.src.math.hypotf
-  COMPILE_OPTIONS
-    -fno-builtin
-)
-
-add_diff_binary(
+add_perf_binary(
   hypotf_perf
   SRCS
     hypotf_perf.cpp
@@ -458,18 +299,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  hypot_diff
-  SRCS
-    hypot_diff.cpp
-  DEPENDS
-    .binary_op_single_output_diff
-    libc.src.math.hypot
-  COMPILE_OPTIONS
-    -fno-builtin
-)
-
-add_diff_binary(
+add_perf_binary(
   hypot_perf
   SRCS
     hypot_perf.cpp
@@ -480,16 +310,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  fmodf_diff
-  SRCS
-    fmodf_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.fmodf
-)
-
-add_diff_binary(
+add_perf_binary(
   fmodf_perf
   SRCS
     fmodf_perf.cpp
@@ -500,16 +321,7 @@ add_diff_binary(
     -fno-builtin
 )
 
-add_diff_binary(
-  fmod_diff
-  SRCS
-    fmod_diff.cpp
-  DEPENDS
-    .single_input_single_output_diff
-    libc.src.math.fmod
-)
-
-add_diff_binary(
+add_perf_binary(
   fmod_perf
   SRCS
     fmod_perf.cpp
diff --git a/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h b/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
similarity index 64%
rename from libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h
rename to libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
index 5e8310e889dc67..b5b38313a69ca9 100644
--- a/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h
+++ b/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
@@ -7,14 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "test/src/math/differential_testing/Timer.h"
+#include "test/src/math/performance_testing/Timer.h"
 
 #include <fstream>
 
 namespace LIBC_NAMESPACE {
 namespace testing {
 
-template <typename T> class SingleInputSingleOutputDiff {
+template <typename T> class SingleInputSingleOutputPerf {
   using FPBits = fputil::FPBits<T>;
   using StorageType = typename FPBits::StorageType;
   static constexpr StorageType UIntMax =
@@ -23,40 +23,18 @@ template <typename T> class SingleInputSingleOutputDiff {
 public:
   typedef T Func(T);
 
-  static void runDiff(Func myFunc, Func otherFunc, const char *logFile) {
-    StorageType diffCount = 0;
-    std::ofstream log(logFile);
-    log << "Starting diff for values from 0 to " << UIntMax << '\n'
-        << "Only differing results will be logged.\n\n";
-    for (StorageType bits = 0;; ++bits) {
-      T x = T(FPBits(bits));
-      T myResult = myFunc(x);
-      T otherResult = otherFunc(x);
-      StorageType myBits = FPBits(myResult).uintval();
-      StorageType otherBits = FPBits(otherResult).uintval();
-      if (myBits != otherBits) {
-        ++diffCount;
-        log << "       Input: " << bits << " (" << x << ")\n"
-            << "   My result: " << myBits << " (" << myResult << ")\n"
-            << "Other result: " << otherBits << " (" << otherResult << ")\n"
-            << '\n';
-      }
-      if (bits == UIntMax)
-        break;
-    }
-    log << "Total number of differing results: " << diffCount << '\n';
-  }
-
   static void runPerfInRange(Func myFunc, Func otherFunc,
                              StorageType startingBit, StorageType endingBit,
                              std::ofstream &log) {
     auto runner = [=](Func func) {
+      constexpr StorageType N = 10'010'001;
+      StorageType step = (endingBit - startingBit) / N;
+      if (step == 0)
+        step = 1;
       volatile T result;
-      for (StorageType bits = startingBit;; ++bits) {
-        T x = T(FPBits(bits));
+      for (StorageType bits = startingBit; bits < endingBit; bits += step) {
+        T x = FPBits(bits).get_val();
         result = func(x);
-        if (bits == endingBit)
-          break;
       }
     };
 
@@ -104,16 +82,9 @@ template <typename T> class SingleInputSingleOutputDiff {
 } // namespace testing
 } // namespace LIBC_NAMESPACE
 
-#define SINGLE_INPUT_SINGLE_OUTPUT_DIFF(T, myFunc, otherFunc, filename)        \
-  int main() {                                                                 \
-    LIBC_NAMESPACE::testing::SingleInputSingleOutputDiff<T>::runDiff(          \
-        &myFunc, &otherFunc, filename);                                        \
-    return 0;                                                                  \
-  }
-
 #define SINGLE_INPUT_SINGLE_OUTPUT_PERF(T, myFunc, otherFunc, filename)        \
   int main() {                                                                 \
-    LIBC_NAMESPACE::testing::SingleInputSingleOutputDiff<T>::runPerf(          \
+    LIBC_NAMESPACE::testing::SingleInputSingleOutputPerf<T>::runPerf(          \
         &myFunc, &otherFunc, filename);                                        \
     return 0;                                                                  \
   }
diff --git a/libc/test/src/math/differential_testing/Timer.cpp b/libc/test/src/math/performance_testing/Timer.cpp
similarity index 100%
rename from libc/test/src/math/differential_testing/Timer.cpp
rename to libc/test/src/math/performance_testing/Timer.cpp
diff --git a/libc/test/src/math/differential_testing/Timer.h b/libc/test/src/math/performance_testing/Timer.h
similarity index 77%
rename from libc/test/src/math/differential_testing/Timer.h
rename to libc/test/src/math/performance_testing/Timer.h
index 0d9518c37d9e0f..2327ede260ab9d 100644
--- a/libc/test/src/math/differential_testing/Timer.h
+++ b/libc/test/src/math/performance_testing/Timer.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_TEST_SRC_MATH_DIFFERENTIAL_TESTING_TIMER_H
-#define LLVM_LIBC_TEST_SRC_MATH_DIFFERENTIAL_TESTING_TIMER_H
+#ifndef LLVM_LIBC_TEST_SRC_MATH_PERFORMANCE_TESTING_TIMER_H
+#define LLVM_LIBC_TEST_SRC_MATH_PERFORMANCE_TESTING_TIMER_H
 
 #include <stdint.h>
 
@@ -30,4 +30,4 @@ class Timer {
 } // namespace testing
 } // namespace LIBC_NAMESPACE
 
-#endif // LLVM_LIBC_TEST_SRC_MATH_DIFFERENTIAL_TESTING_TIMER_H
+#endif // LLVM_LIBC_TEST_SRC_MATH_PERFORMANCE_TESTING_TIMER_H
diff --git a/libc/test/src/math/differential_testing/ceilf_perf.cpp b/libc/test/src/math/performance_testing/ceilf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/ceilf_perf.cpp
rename to libc/test/src/math/performance_testing/ceilf_perf.cpp
index c304231e0678de..04e96f6fb2dccc 100644
--- a/libc/test/src/math/differential_testing/ceilf_perf.cpp
+++ b/libc/test/src/math/performance_testing/ceilf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/ceilf.h"
 
diff --git a/libc/test/src/math/differential_testing/cosf_perf.cpp b/libc/test/src/math/performance_testing/cosf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/cosf_perf.cpp
rename to libc/test/src/math/performance_testing/cosf_perf.cpp
index 981a94133b8040..1501b8bf254044 100644
--- a/libc/test/src/math/differential_testing/cosf_perf.cpp
+++ b/libc/test/src/math/performance_testing/cosf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/cosf.h"
 
diff --git a/libc/test/src/math/differential_testing/exp2f_perf.cpp b/libc/test/src/math/performance_testing/exp2f_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/exp2f_perf.cpp
rename to libc/test/src/math/performance_testing/exp2f_perf.cpp
index 4aae5220e6a516..19a70ac6569aa4 100644
--- a/libc/test/src/math/differential_testing/exp2f_perf.cpp
+++ b/libc/test/src/math/performance_testing/exp2f_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/exp2f.h"
 
diff --git a/libc/test/src/math/differential_testing/expf_perf.cpp b/libc/test/src/math/performance_testing/expf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/expf_perf.cpp
rename to libc/test/src/math/performance_testing/expf_perf.cpp
index c34173b21b4f60..4b743514023d12 100644
--- a/libc/test/src/math/differential_testing/expf_perf.cpp
+++ b/libc/test/src/math/performance_testing/expf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/expf.h"
 
diff --git a/libc/test/src/math/differential_testing/expm1f_perf.cpp b/libc/test/src/math/performance_testing/expm1f_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/expm1f_perf.cpp
rename to libc/test/src/math/performance_testing/expm1f_perf.cpp
index 3c25ef81d4808c..128ab351d86db1 100644
--- a/libc/test/src/math/differential_testing/expm1f_perf.cpp
+++ b/libc/test/src/math/performance_testing/expm1f_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/expm1f.h"
 
diff --git a/libc/test/src/math/differential_testing/fabsf_perf.cpp b/libc/test/src/math/performance_testing/fabsf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/fabsf_perf.cpp
rename to libc/test/src/math/performance_testing/fabsf_perf.cpp
index f9f9cea72c6dae..b6c6add75d230c 100644
--- a/libc/test/src/math/differential_testing/fabsf_perf.cpp
+++ b/libc/test/src/math/performance_testing/fabsf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/fabsf.h"
 
diff --git a/libc/test/src/math/differential_testing/floorf_perf.cpp b/libc/test/src/math/performance_testing/floorf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/floorf_perf.cpp
rename to libc/test/src/math/performance_testing/floorf_perf.cpp
index abd1cd7885ffd2..0f1087b3c8236b 100644
--- a/libc/test/src/math/differential_testing/floorf_perf.cpp
+++ b/libc/test/src/math/performance_testing/floorf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/floorf.h"
 
diff --git a/libc/test/src/math/differential_testing/fmod_perf.cpp b/libc/test/src/math/performance_testing/fmod_perf.cpp
similarity index 93%
rename from libc/test/src/math/differential_testing/fmod_perf.cpp
rename to libc/test/src/math/performance_testing/fmod_perf.cpp
index 219ee7860a242b..fa9b4c6b41287b 100644
--- a/libc/test/src/math/differential_testing/fmod_perf.cpp
+++ b/libc/test/src/math/performance_testing/fmod_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "BinaryOpSingleOutputDiff.h"
+#include "BinaryOpSingleOutputPerf.h"
 
 #include "src/math/fmod.h"
 
diff --git a/libc/test/src/math/differential_testing/fmodf_perf.cpp b/libc/test/src/math/performance_testing/fmodf_perf.cpp
similarity index 93%
rename from libc/test/src/math/differential_testing/fmodf_perf.cpp
rename to libc/test/src/math/performance_testing/fmodf_perf.cpp
index c2927bb1ea9d9f..f13f02e2439da3 100644
--- a/libc/test/src/math/differential_testing/fmodf_perf.cpp
+++ b/libc/test/src/math/performance_testing/fmodf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "BinaryOpSingleOutputDiff.h"
+#include "BinaryOpSingleOutputPerf.h"
 
 #include "src/math/fmodf.h"
 
diff --git a/libc/test/src/math/differential_testing/hypot_perf.cpp b/libc/test/src/math/performance_testing/hypot_perf.cpp
similarity index 93%
rename from libc/test/src/math/differential_testing/hypot_perf.cpp
rename to libc/test/src/math/performance_testing/hypot_perf.cpp
index 01a72e6fbc3d79..393697b7540330 100644
--- a/libc/test/src/math/differential_testing/hypot_perf.cpp
+++ b/libc/test/src/math/performance_testing/hypot_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "BinaryOpSingleOutputDiff.h"
+#include "BinaryOpSingleOutputPerf.h"
 
 #include "src/math/hypot.h"
 
diff --git a/libc/test/src/math/differential_testing/hypotf_perf.cpp b/libc/test/src/math/performance_testing/hypotf_perf.cpp
similarity index 93%
rename from libc/test/src/math/differential_testing/hypotf_perf.cpp
rename to libc/test/src/math/performance_testing/hypotf_perf.cpp
index ed57b186f889ba..f711729377dacf 100644
--- a/libc/test/src/math/differential_testing/hypotf_perf.cpp
+++ b/libc/test/src/math/performance_testing/hypotf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "BinaryOpSingleOutputDiff.h"
+#include "BinaryOpSingleOutputPerf.h"
 
 #include "src/math/hypotf.h"
 
diff --git a/libc/test/src/math/differential_testing/log10f_perf.cpp b/libc/test/src/math/performance_testing/log10f_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/log10f_perf.cpp
rename to libc/test/src/math/performance_testing/log10f_perf.cpp
index 60c1161a31cf96..32a31b93252855 100644
--- a/libc/test/src/math/differential_testing/log10f_perf.cpp
+++ b/libc/test/src/math/performance_testing/log10f_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/log10f.h"
 
diff --git a/libc/test/src/math/differential_testing/log1pf_perf.cpp b/libc/test/src/math/performance_testing/log1pf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/log1pf_perf.cpp
rename to libc/test/src/math/performance_testing/log1pf_perf.cpp
index 5cd523d82184cc..18c168423b87d1 100644
--- a/libc/test/src/math/differential_testing/log1pf_perf.cpp
+++ b/libc/test/src/math/performance_testing/log1pf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/log1pf.h"
 
diff --git a/libc/test/src/math/differential_testing/log2f_perf.cpp b/libc/test/src/math/performance_testing/log2f_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/log2f_perf.cpp
rename to libc/test/src/math/performance_testing/log2f_perf.cpp
index ee899394c421ed..c4c4dbf4d9f554 100644
--- a/libc/test/src/math/differential_testing/log2f_perf.cpp
+++ b/libc/test/src/math/performance_testing/log2f_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/log2f.h"
 
diff --git a/libc/test/src/math/differential_testing/logbf_perf.cpp b/libc/test/src/math/performance_testing/logbf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/logbf_perf.cpp
rename to libc/test/src/math/performance_testing/logbf_perf.cpp
index 89d5bd13f9316b..eefd64b8ae913f 100644
--- a/libc/test/src/math/differential_testing/logbf_perf.cpp
+++ b/libc/test/src/math/performance_testing/logbf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/logbf.h"
 
diff --git a/libc/test/src/math/differential_testing/logf_perf.cpp b/libc/test/src/math/performance_testing/logf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/logf_perf.cpp
rename to libc/test/src/math/performance_testing/logf_perf.cpp
index f1b3f986bd40a3..53f4f50e09efe4 100644
--- a/libc/test/src/math/differential_testing/logf_perf.cpp
+++ b/libc/test/src/math/performance_testing/logf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/logf.h"
 
diff --git a/libc/test/src/math/differential_testing/nearbyintf_perf.cpp b/libc/test/src/math/performance_testing/nearbyintf_perf.cpp
similarity index 93%
rename from libc/test/src/math/differential_testing/nearbyintf_perf.cpp
rename to libc/test/src/math/performance_testing/nearbyintf_perf.cpp
index 9c5736fb4ab048..ae708dd2132432 100644
--- a/libc/test/src/math/differential_testing/nearbyintf_perf.cpp
+++ b/libc/test/src/math/performance_testing/nearbyintf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/nearbyintf.h"
 
diff --git a/libc/test/src/math/differential_testing/rintf_perf.cpp b/libc/test/src/math/performance_testing/rintf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/rintf_perf.cpp
rename to libc/test/src/math/performance_testing/rintf_perf.cpp
index 432e5da77f3789..6347ac9149af6e 100644
--- a/libc/test/src/math/differential_testing/rintf_perf.cpp
+++ b/libc/test/src/math/performance_testing/rintf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/rintf.h"
 
diff --git a/libc/test/src/math/differential_testing/roundf_perf.cpp b/libc/test/src/math/performance_testing/roundf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/roundf_perf.cpp
rename to libc/test/src/math/performance_testing/roundf_perf.cpp
index 091c7b2b86800f..36becacba07cb5 100644
--- a/libc/test/src/math/differential_testing/roundf_perf.cpp
+++ b/libc/test/src/math/performance_testing/roundf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/roundf.h"
 
diff --git a/libc/test/src/math/differential_testing/sinf_perf.cpp b/libc/test/src/math/performance_testing/sinf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/sinf_perf.cpp
rename to libc/test/src/math/performance_testing/sinf_perf.cpp
index 7247bca2853d88..43ba60e1ef76a4 100644
--- a/libc/test/src/math/differential_testing/sinf_perf.cpp
+++ b/libc/test/src/math/performance_testing/sinf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/sinf.h"
 
diff --git a/libc/test/src/math/differential_testing/sqrtf_perf.cpp b/libc/test/src/math/performance_testing/sqrtf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/sqrtf_perf.cpp
rename to libc/test/src/math/performance_testing/sqrtf_perf.cpp
index 5ae586ba31267d..71325518533b60 100644
--- a/libc/test/src/math/differential_testing/sqrtf_perf.cpp
+++ b/libc/test/src/math/performance_testing/sqrtf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/sqrtf.h"
 
diff --git a/libc/test/src/math/differential_testing/truncf_perf.cpp b/libc/test/src/math/performance_testing/truncf_perf.cpp
similarity index 92%
rename from libc/test/src/math/differential_testing/truncf_perf.cpp
rename to libc/test/src/math/performance_testing/truncf_perf.cpp
index e07db1320fddd7..ff74c6b4eb64df 100644
--- a/libc/test/src/math/differential_testing/truncf_perf.cpp
+++ b/libc/test/src/math/performance_testing/truncf_perf.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SingleInputSingleOutputDiff.h"
+#include "SingleInputSingleOutputPerf.h"
 
 #include "src/math/truncf.h"
 
diff --git a/libc/utils/CMakeLists.txt b/libc/utils/CMakeLists.txt
index 7bf02a4af7deae..11f25503cc13e2 100644
--- a/libc/utils/CMakeLists.txt
+++ b/libc/utils/CMakeLists.txt
@@ -1,6 +1,3 @@
 if(LLVM_INCLUDE_TESTS)
   add_subdirectory(MPFRWrapper)
 endif()
-if(LIBC_TARGET_OS_IS_GPU)
-  add_subdirectory(gpu)
-endif()
diff --git a/libc/utils/gpu/CMakeLists.txt b/libc/utils/gpu/CMakeLists.txt
index 4d1ebcfb9f8e65..7c15f36052cf3b 100644
--- a/libc/utils/gpu/CMakeLists.txt
+++ b/libc/utils/gpu/CMakeLists.txt
@@ -1,4 +1,2 @@
 add_subdirectory(server)
-if(LIBC_TARGET_OS_IS_GPU)
-  add_subdirectory(loader)
-endif()
+add_subdirectory(loader)
diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
index 189460bb02e6e5..b562cdc521c076 100644
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -6,37 +6,18 @@ target_include_directories(gpu_loader PUBLIC
   ${LIBC_SOURCE_DIR}
 )
 
-# This utility needs to be compiled for the host system when cross compiling.
-if(LLVM_RUNTIMES_TARGET OR LIBC_TARGET_TRIPLE)
-  target_compile_options(gpu_loader PUBLIC --target=${LLVM_HOST_TRIPLE})
-  target_link_libraries(gpu_loader PUBLIC "--target=${LLVM_HOST_TRIPLE}")
-endif()
-
 find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
-if(hsa-runtime64_FOUND AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+if(hsa-runtime64_FOUND)
   add_subdirectory(amdgpu)
-elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
-  message(STATUS "Skipping HSA loader for gpu target, no HSA was detected")
 endif()
 
 # The CUDA loader requires LLVM to traverse the ELF image for symbols.
-find_package(LLVM QUIET)
-if(CUDAToolkit_FOUND AND LLVM_FOUND AND LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+find_package(CUDAToolkit 11.2 QUIET)
+if(CUDAToolkit_FOUND)
   add_subdirectory(nvptx)
-elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
-  message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
 endif()
 
-# Add a custom target to be used for testing.
-set(LIBC_GPU_LOADER_EXECUTABLE "" CACHE STRING "Overriding binary for the GPU loader.")
-if(LIBC_GPU_LOADER_EXECUTABLE)
-  add_custom_target(libc.utils.gpu.loader)
-  set_target_properties(
-    libc.utils.gpu.loader
-    PROPERTIES
-      EXECUTABLE "${LIBC_GPU_LOADER_EXECUTABLE}"
-  )
-elseif(TARGET amdhsa-loader AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+if(TARGET amdhsa-loader AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
   add_custom_target(libc.utils.gpu.loader)
   add_dependencies(libc.utils.gpu.loader amdhsa-loader)
   set_target_properties(
@@ -56,11 +37,10 @@ elseif(TARGET nvptx-loader AND LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   )
 endif()
 
-if(TARGET libc.utils.gpu.loader)
-  get_target_property(gpu_loader_tgt libc.utils.gpu.loader "TARGET")
-  if(gpu_loader_tgt)
+foreach(gpu_loader_tgt amdhsa-loader nvptx-loader)
+  if(TARGET ${gpu_loader_tgt})
     install(TARGETS ${gpu_loader_tgt}
             DESTINATION ${CMAKE_INSTALL_BINDIR}
             COMPONENT libc)
   endif()
-endif()
+endforeach()
diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
index b99319f5040112..97a2de9f8379ab 100644
--- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
+++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_executable(amdhsa-loader Loader.cpp)
-add_dependencies(amdhsa-loader libc.src.__support.RPC.rpc)
 
 target_link_libraries(amdhsa-loader
   PRIVATE
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
index e76362a1e8cca6..948493959badf2 100644
--- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -1,10 +1,10 @@
 add_executable(nvptx-loader Loader.cpp)
-add_dependencies(nvptx-loader libc.src.__support.RPC.rpc)
 
 if(NOT LLVM_ENABLE_RTTI)
   target_compile_options(nvptx-loader PRIVATE -fno-rtti)
 endif()
-target_include_directories(nvptx-loader PRIVATE ${LLVM_INCLUDE_DIRS})
+target_include_directories(nvptx-loader PRIVATE
+                           ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include)
 target_link_libraries(nvptx-loader
   PRIVATE
   gpu_loader
diff --git a/libc/utils/gpu/server/CMakeLists.txt b/libc/utils/gpu/server/CMakeLists.txt
index 10cfdb45a2c9d2..6fca72cfae95fb 100644
--- a/libc/utils/gpu/server/CMakeLists.txt
+++ b/libc/utils/gpu/server/CMakeLists.txt
@@ -5,21 +5,12 @@ target_include_directories(llvmlibc_rpc_server PRIVATE ${LIBC_SOURCE_DIR})
 target_include_directories(llvmlibc_rpc_server PUBLIC ${LIBC_SOURCE_DIR}/include)
 target_include_directories(llvmlibc_rpc_server PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 
-
 # Ignore unsupported clang attributes if we're using GCC.
 target_compile_options(llvmlibc_rpc_server PUBLIC
                        $<$<CXX_COMPILER_ID:GNU>:-Wno-attributes>)
 target_compile_definitions(llvmlibc_rpc_server PUBLIC
                            LIBC_NAMESPACE=${LIBC_NAMESPACE})
 
-# This utility needs to be compiled for the host system when cross compiling.
-if(LLVM_RUNTIMES_TARGET OR LIBC_TARGET_TRIPLE)
-  target_compile_options(llvmlibc_rpc_server PUBLIC
-                         --target=${LLVM_HOST_TRIPLE})
-  target_link_libraries(llvmlibc_rpc_server PUBLIC
-                        "--target=${LLVM_HOST_TRIPLE}")
-endif()
-
 # Install the server and associated header.
 install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/llvmlibc_rpc_server.h
         DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index e37c4ac4fddd8c..63adc03fae2980 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -701,6 +701,7 @@ set(files
   __thread/thread.h
   __thread/timed_backoff_policy.h
   __tree
+  __tuple/find_index.h
   __tuple/make_tuple_types.h
   __tuple/pair_like.h
   __tuple/sfinae_helpers.h
diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h
index 53e2f718bc1b35..c9027de9238cdd 100644
--- a/libcxx/include/__exception/exception_ptr.h
+++ b/libcxx/include/__exception/exception_ptr.h
@@ -26,6 +26,8 @@
 
 #ifndef _LIBCPP_ABI_MICROSOFT
 
+#  if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION
+
 namespace __cxxabiv1 {
 
 extern "C" {
@@ -37,14 +39,16 @@ _LIBCPP_OVERRIDABLE_FUNC_VIS __cxa_exception* __cxa_init_primary_exception(
     void*,
     std::type_info*,
     void(
-#  if defined(_WIN32)
+#    if defined(_WIN32)
         __thiscall
-#  endif
+#    endif
             *)(void*)) throw();
 }
 
 } // namespace __cxxabiv1
 
+#  endif
+
 #endif
 
 namespace std { // purposefully not using versioning namespace
diff --git a/libcxx/include/__tuple/find_index.h b/libcxx/include/__tuple/find_index.h
new file mode 100644
index 00000000000000..133b00419d0c6c
--- /dev/null
+++ b/libcxx/include/__tuple/find_index.h
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___TUPLE_FIND_INDEX_H
+#define _LIBCPP___TUPLE_FIND_INDEX_H
+
+#include <__config>
+#include <__type_traits/is_same.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 14
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace __find_detail {
+
+static constexpr size_t __not_found = static_cast<size_t>(-1);
+static constexpr size_t __ambiguous = __not_found - 1;
+
+inline _LIBCPP_HIDE_FROM_ABI constexpr size_t __find_idx_return(size_t __curr_i, size_t __res, bool __matches) {
+  return !__matches ? __res : (__res == __not_found ? __curr_i : __ambiguous);
+}
+
+template <size_t _Nx>
+inline _LIBCPP_HIDE_FROM_ABI constexpr size_t __find_idx(size_t __i, const bool (&__matches)[_Nx]) {
+  return __i == _Nx
+           ? __not_found
+           : __find_detail::__find_idx_return(__i, __find_detail::__find_idx(__i + 1, __matches), __matches[__i]);
+}
+
+template <class _T1, class... _Args>
+struct __find_exactly_one_checked {
+  static constexpr bool __matches[sizeof...(_Args)] = {is_same<_T1, _Args>::value...};
+  static constexpr size_t value                     = __find_detail::__find_idx(0, __matches);
+  static_assert(value != __not_found, "type not found in type list");
+  static_assert(value != __ambiguous, "type occurs more than once in type list");
+};
+
+template <class _T1>
+struct __find_exactly_one_checked<_T1> {
+  static_assert(!is_same<_T1, _T1>::value, "type not in empty type list");
+};
+
+} // namespace __find_detail
+
+template <typename _T1, typename... _Args>
+struct __find_exactly_one_t : public __find_detail::__find_exactly_one_checked<_T1, _Args...> {};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 14
+
+#endif // _LIBCPP___TUPLE_FIND_INDEX_H
diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp
index e02dc8da6ba182..b1e728cde868da 100644
--- a/libcxx/include/libcxx.imp
+++ b/libcxx/include/libcxx.imp
@@ -697,6 +697,7 @@
   { include: [ "<__thread/this_thread.h>", "private", "<thread>", "public" ] },
   { include: [ "<__thread/thread.h>", "private", "<thread>", "public" ] },
   { include: [ "<__thread/timed_backoff_policy.h>", "private", "<thread>", "public" ] },
+  { include: [ "<__tuple/find_index.h>", "private", "<tuple>", "public" ] },
   { include: [ "<__tuple/make_tuple_types.h>", "private", "<tuple>", "public" ] },
   { include: [ "<__tuple/pair_like.h>", "private", "<tuple>", "public" ] },
   { include: [ "<__tuple/sfinae_helpers.h>", "private", "<tuple>", "public" ] },
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 98890e890cdb13..0bd2831b7f159c 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1799,6 +1799,7 @@ module std_private_thread_thread               [system] {
 }
 module std_private_thread_timed_backoff_policy [system] { header "__thread/timed_backoff_policy.h" }
 
+module std_private_tuple_find_index       [system] { header "__tuple/find_index.h" }
 module std_private_tuple_make_tuple_types [system] { header "__tuple/make_tuple_types.h" }
 module std_private_tuple_pair_like        [system] {
   header "__tuple/pair_like.h"
diff --git a/libcxx/include/tuple b/libcxx/include/tuple
index 8808db6739fb9b..e63e4e25a7d2bd 100644
--- a/libcxx/include/tuple
+++ b/libcxx/include/tuple
@@ -213,6 +213,7 @@ template <class... Types>
 #include <__fwd/tuple.h>
 #include <__memory/allocator_arg_t.h>
 #include <__memory/uses_allocator.h>
+#include <__tuple/find_index.h>
 #include <__tuple/make_tuple_types.h>
 #include <__tuple/sfinae_helpers.h>
 #include <__tuple/tuple_element.h>
@@ -1087,40 +1088,6 @@ get(const tuple<_Tp...>&& __t) _NOEXCEPT {
 
 #  if _LIBCPP_STD_VER >= 14
 
-namespace __find_detail {
-
-static constexpr size_t __not_found = static_cast<size_t>(-1);
-static constexpr size_t __ambiguous = __not_found - 1;
-
-inline _LIBCPP_HIDE_FROM_ABI constexpr size_t __find_idx_return(size_t __curr_i, size_t __res, bool __matches) {
-  return !__matches ? __res : (__res == __not_found ? __curr_i : __ambiguous);
-}
-
-template <size_t _Nx>
-inline _LIBCPP_HIDE_FROM_ABI constexpr size_t __find_idx(size_t __i, const bool (&__matches)[_Nx]) {
-  return __i == _Nx
-           ? __not_found
-           : __find_detail::__find_idx_return(__i, __find_detail::__find_idx(__i + 1, __matches), __matches[__i]);
-}
-
-template <class _T1, class... _Args>
-struct __find_exactly_one_checked {
-  static constexpr bool __matches[sizeof...(_Args)] = {is_same<_T1, _Args>::value...};
-  static constexpr size_t value                     = __find_detail::__find_idx(0, __matches);
-  static_assert(value != __not_found, "type not found in type list");
-  static_assert(value != __ambiguous, "type occurs more than once in type list");
-};
-
-template <class _T1>
-struct __find_exactly_one_checked<_T1> {
-  static_assert(!is_same<_T1, _T1>::value, "type not in empty type list");
-};
-
-} // namespace __find_detail
-
-template <typename _T1, typename... _Args>
-struct __find_exactly_one_t : public __find_detail::__find_exactly_one_checked<_T1, _Args...> {};
-
 template <class _T1, class... _Args>
 inline _LIBCPP_HIDE_FROM_ABI constexpr _T1& get(tuple<_Args...>& __tup) noexcept {
   return std::get<__find_exactly_one_t<_T1, _Args...>::value>(__tup);
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 5ce99250a8b4f4..d1eea52f0a9301 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -221,13 +221,18 @@ namespace std {
 #include <__functional/operations.h>
 #include <__functional/unary_function.h>
 #include <__memory/addressof.h>
+#include <__tuple/find_index.h>
+#include <__tuple/sfinae_helpers.h>
 #include <__type_traits/add_const.h>
 #include <__type_traits/add_cv.h>
 #include <__type_traits/add_pointer.h>
 #include <__type_traits/add_volatile.h>
+#include <__type_traits/common_type.h>
 #include <__type_traits/dependent_type.h>
 #include <__type_traits/is_array.h>
+#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_destructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_move_constructible.h>
 #include <__type_traits/is_trivially_copy_assignable.h>
 #include <__type_traits/is_trivially_copy_constructible.h>
@@ -242,6 +247,7 @@ namespace std {
 #include <__utility/forward.h>
 #include <__utility/forward_like.h>
 #include <__utility/in_place.h>
+#include <__utility/integer_sequence.h>
 #include <__utility/move.h>
 #include <__utility/swap.h>
 #include <__variant/monostate.h>
@@ -249,7 +255,6 @@ namespace std {
 #include <initializer_list>
 #include <limits>
 #include <new>
-#include <tuple>
 #include <version>
 
 // standard-mandated includes
@@ -340,21 +345,20 @@ struct _LIBCPP_TEMPLATE_VIS variant_alternative<_Ip, variant<_Types...>> {
 
 inline constexpr size_t variant_npos = static_cast<size_t>(-1);
 
-_LIBCPP_HIDE_FROM_ABI constexpr int __choose_index_type(unsigned int __num_elem) {
-  if (__num_elem < numeric_limits<unsigned char>::max())
-    return 0;
-  if (__num_elem < numeric_limits<unsigned short>::max())
-    return 1;
-  return 2;
+template <size_t _NumAlternatives>
+_LIBCPP_HIDE_FROM_ABI constexpr auto __choose_index_type() {
+#ifdef _LIBCPP_ABI_VARIANT_INDEX_TYPE_OPTIMIZATION
+  if constexpr (_NumAlternatives < numeric_limits<unsigned char>::max())
+    return static_cast<unsigned char>(0);
+  else if constexpr (_NumAlternatives < numeric_limits<unsigned short>::max())
+    return static_cast<unsigned short>(0);
+  else
+#endif // _LIBCPP_ABI_VARIANT_INDEX_TYPE_OPTIMIZATION
+    return static_cast<unsigned int>(0);
 }
 
 template <size_t _NumAlts>
-using __variant_index_t =
-#  ifndef _LIBCPP_ABI_VARIANT_INDEX_TYPE_OPTIMIZATION
-    unsigned int;
-#  else
-    std::tuple_element_t< __choose_index_type(_NumAlts), std::tuple<unsigned char, unsigned short, unsigned int> >;
-#  endif
+using __variant_index_t = decltype(std::__choose_index_type<_NumAlts>());
 
 template <class _IndexType>
 constexpr _IndexType __variant_npos = static_cast<_IndexType>(-1);
@@ -1625,6 +1629,7 @@ _LIBCPP_POP_MACROS
 
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <exception>
+#  include <tuple>
 #  include <type_traits>
 #  include <typeinfo>
 #  include <utility>
diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv
index 043d23d551c5cd..daa3e17698bbed 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx23.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv
@@ -661,7 +661,6 @@ variant cstring
 variant initializer_list
 variant limits
 variant new
-variant tuple
 variant version
 vector array
 vector cctype
diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index 043d23d551c5cd..daa3e17698bbed 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -661,7 +661,6 @@ variant cstring
 variant initializer_list
 variant limits
 variant new
-variant tuple
 variant version
 vector array
 vector cctype
diff --git a/libcxx/test/libcxx/utilities/variant/variant.variant/variant_size.pass.cpp b/libcxx/test/libcxx/utilities/variant/variant.variant/variant_size.pass.cpp
index 9011e61e78808a..2f1ea8bffb479b 100644
--- a/libcxx/test/libcxx/utilities/variant/variant.variant/variant_size.pass.cpp
+++ b/libcxx/test/libcxx/utilities/variant/variant.variant/variant_size.pass.cpp
@@ -49,13 +49,13 @@ void test_index_type() {
 template <class IndexType>
 void test_index_internals() {
   using Lim = std::numeric_limits<IndexType>;
-  static_assert(std::__choose_index_type(Lim::max() -1) !=
-                std::__choose_index_type(Lim::max()), "");
-  static_assert(std::is_same_v<
-      std::__variant_index_t<Lim::max()-1>,
-      std::__variant_index_t<Lim::max()>
-    > == ExpectEqual, "");
-  using IndexT = std::__variant_index_t<Lim::max()-1>;
+#ifdef _LIBCPP_ABI_VARIANT_INDEX_TYPE_OPTIMIZATION
+  static_assert(!std::is_same_v<decltype(std::__choose_index_type<Lim::max() - 1>()),
+                                decltype(std::__choose_index_type<Lim::max()>())>);
+#endif
+  static_assert(
+      std::is_same_v<std::__variant_index_t<Lim::max() - 1>, std::__variant_index_t<Lim::max()> > == ExpectEqual, "");
+  using IndexT   = std::__variant_index_t<Lim::max() - 1>;
   using IndexLim = std::numeric_limits<IndexT>;
   static_assert(std::__variant_npos<IndexT> == IndexLim::max(), "");
 }
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp
index 136910b90c909a..c47fb188c865c4 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp
@@ -27,10 +27,10 @@
 #include <cassert>
 #include <concepts>
 #include <format>
+#include <memory>
 #include <queue>
 #include <stack>
 
-#include "test_format_context.h"
 #include "test_macros.h"
 #include "make_string.h"
 
@@ -44,7 +44,8 @@ constexpr void test_parse(StringViewT fmt, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 }
 
 template <class StringViewT>
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp
index c761039442196f..abae40d78b23ac 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp
@@ -25,9 +25,9 @@
 #include <cassert>
 #include <concepts>
 #include <format>
+#include <memory>
 #include <vector>
 
-#include "test_format_context.h"
 #include "test_macros.h"
 #include "make_string.h"
 
@@ -41,7 +41,8 @@ constexpr void test_parse(StringViewT fmt, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 }
 
 template <class CharT>
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
index 8523bc89497174..2e75606832b435 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
@@ -24,9 +24,9 @@
 
 #include <cassert>
 #include <concepts>
+#include <memory>
 #include <thread>
 
-#include "test_format_context.h"
 #include "test_macros.h"
 #include "make_string.h"
 
@@ -40,7 +40,8 @@ constexpr void test_parse(StringViewT fmt, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 }
 
 template <class CharT>
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp
index efea2889ce3b5a..116f78e63be09e 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp
@@ -22,6 +22,7 @@
 #include <cassert>
 #include <concepts>
 #include <iterator>
+#include <memory>
 #include <type_traits>
 
 #include "test_format_context.h"
@@ -38,7 +39,8 @@ void test(StringT expected, StringViewT fmt, bool arg, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 
   StringT result;
   auto out = std::back_inserter(result);
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp
index f363bc3032003c..3125dd8b60bbdc 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp
@@ -21,6 +21,7 @@
 #include <cassert>
 #include <concepts>
 #include <iterator>
+#include <memory>
 #include <type_traits>
 
 #include "test_format_context.h"
@@ -38,7 +39,8 @@ void test(StringT expected, StringViewT fmt, const CharT* a, std::size_t offset)
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 
   StringT result;
   auto out = std::back_inserter(result);
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp
index 554def930020a9..0723547c2df275 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp
@@ -22,6 +22,7 @@
 #include <cassert>
 #include <concepts>
 #include <iterator>
+#include <memory>
 #include <type_traits>
 
 #include "test_format_context.h"
@@ -39,7 +40,8 @@ void test(StringT expected, StringViewT fmt, ArgumentT arg, std::size_t offset)
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 
   StringT result;
   auto out = std::back_inserter(result);
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
index 295ba7f67bbc5b..b0ee399a1c191e 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
@@ -21,6 +21,7 @@
 #include <cassert>
 #include <concepts>
 #include <iterator>
+#include <memory>
 #include <type_traits>
 
 #include "test_format_context.h"
@@ -51,7 +52,8 @@ struct Tester {
     static_assert(std::semiregular<decltype(formatter)>);
 
     std::same_as<typename std::basic_string_view<CharT>::iterator> auto it = formatter.parse(parse_ctx);
-    assert(it == fmt.end() - offset);
+    // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+    assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 
     std::basic_string<CharT> result;
     auto out = std::back_inserter(result);
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp
index 206b0214cf5fd7..263dc1d8d85180 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp
@@ -29,10 +29,12 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cctype>
 #include <cmath>
 #include <charconv>
 #include <concepts>
 #include <iterator>
+#include <memory>
 #include <string>
 #include <type_traits>
 
@@ -49,7 +51,8 @@ void test(std::basic_string_view<CharT> fmt, ArithmeticT arg, std::basic_string<
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename std::basic_string_view<CharT>::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 
   std::basic_string<CharT> result;
   auto out = std::back_inserter(result);
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp
index e2b3d6b3d23701..5921cc6efcecbe 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp
@@ -21,6 +21,7 @@
 #include <charconv>
 #include <concepts>
 #include <iterator>
+#include <memory>
 #include <string>
 #include <type_traits>
 
@@ -43,7 +44,8 @@ void test(std::string expected, std::string_view fmt, color arg, std::size_t off
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename std::string_view::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 
   std::string result;
   auto out = std::back_inserter(result);
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp
index aa10f34c95b796..408168e033bb6c 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp
@@ -27,6 +27,7 @@
 #include <charconv>
 #include <concepts>
 #include <iterator>
+#include <memory>
 #include <string>
 #include <type_traits>
 
@@ -44,7 +45,8 @@ void test(StringT expected, StringViewT fmt, PointerT arg, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 
   StringT result;
   auto out = std::back_inserter(result);
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp
index e5db5dac0c569d..cdd56d1b882a00 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp
@@ -30,6 +30,7 @@
 #include <cassert>
 #include <concepts>
 #include <iterator>
+#include <memory>
 #include <type_traits>
 
 #include "test_format_context.h"
@@ -46,7 +47,8 @@ void test(StringT expected, StringViewT fmt, ArithmeticT arg, std::size_t offset
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 
   StringT result;
   auto out = std::back_inserter(result);
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp
index 73df7464dcb7a7..49f54dae264787 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp
@@ -23,6 +23,7 @@
 #include <cassert>
 #include <concepts>
 #include <iterator>
+#include <memory>
 #include <type_traits>
 
 #include "make_string.h"
@@ -46,7 +47,8 @@ void test(StringT expected, StringViewT fmt, StringT a, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 
   StringT result;
   auto out = std::back_inserter(result);
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp
index eb70115bf5de59..a9537465faf9db 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp
@@ -30,6 +30,7 @@
 #include <cassert>
 #include <concepts>
 #include <iterator>
+#include <memory>
 #include <type_traits>
 
 #include "test_format_context.h"
@@ -46,7 +47,8 @@ void test(StringT expected, StringViewT fmt, ArithmeticT arg, std::size_t offset
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 
   StringT result;
   auto out = std::back_inserter(result);
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp
index 9f9b4d4545a892..0eb984cc2c01aa 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp
@@ -22,8 +22,8 @@
 #include <cassert>
 #include <concepts>
 #include <format>
+#include <memory>
 
-#include "test_format_context.h"
 #include "test_macros.h"
 #include "make_string.h"
 
@@ -37,7 +37,8 @@ constexpr void test_parse(StringViewT fmt, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 }
 
 template <class CharT>
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp
index daa92214845ba9..99d6aa7452a022 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp
@@ -25,8 +25,8 @@
 #include <concepts>
 #include <format>
 #include <map>
+#include <memory>
 
-#include "test_format_context.h"
 #include "test_macros.h"
 #include "make_string.h"
 
@@ -40,7 +40,8 @@ constexpr void test_parse(StringViewT fmt, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 }
 
 template <class CharT>
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp
index 843855f4e6d076..182beff4bd168f 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp
@@ -24,9 +24,9 @@
 #include <cassert>
 #include <concepts>
 #include <format>
+#include <memory>
 #include <set>
 
-#include "test_format_context.h"
 #include "test_macros.h"
 #include "make_string.h"
 
@@ -40,7 +40,8 @@ constexpr void test_parse(StringViewT fmt, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 }
 
 template <class CharT>
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp
index 7acee9cb9dc51a..3354de34721990 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp
@@ -23,9 +23,9 @@
 #include <cassert>
 #include <concepts>
 #include <format>
+#include <memory>
 
 #include "format.functions.tests.h"
-#include "test_format_context.h"
 #include "test_macros.h"
 
 template <class FormatterT, class StringViewT>
@@ -36,7 +36,8 @@ constexpr void test_parse(StringViewT fmt, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 }
 
 template <class StringViewT>
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp
index 87774c26208771..2d0cef11feb8c5 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp
@@ -25,8 +25,8 @@
 #include <cassert>
 #include <concepts>
 #include <format>
+#include <memory>
 
-#include "test_format_context.h"
 #include "test_macros.h"
 #include "make_string.h"
 
@@ -40,7 +40,8 @@ constexpr void test_parse(StringViewT fmt, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 }
 
 template <class CharT>
diff --git a/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp
index 5cabbda63dd02e..8653c282bfe107 100644
--- a/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp
@@ -24,10 +24,10 @@
 #include <cassert>
 #include <concepts>
 #include <format>
+#include <memory>
 #include <tuple>
 #include <utility>
 
-#include "test_format_context.h"
 #include "test_macros.h"
 #include "make_string.h"
 
@@ -41,7 +41,8 @@ constexpr void test(StringViewT fmt, std::size_t offset) {
   static_assert(std::semiregular<decltype(formatter)>);
 
   std::same_as<typename StringViewT::iterator> auto it = formatter.parse(parse_ctx);
-  assert(it == fmt.end() - offset);
+  // std::to_address works around LWG3989 and MSVC STL's iterator debugging mechanism.
+  assert(std::to_address(it) == std::to_address(fmt.end()) - offset);
 }
 
 template <class CharT, class Arg>
diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.elem/tuple.by.type.verify.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.elem/tuple.by.type.verify.cpp
index 1d05eb5fe76e97..00f27c3220d2ed 100644
--- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.elem/tuple.by.type.verify.cpp
+++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.elem/tuple.by.type.verify.cpp
@@ -18,13 +18,13 @@ struct UserType {};
 
 void test_bad_index() {
     std::tuple<long, long, char, std::string, char, UserType, char> t1;
-    TEST_IGNORE_NODISCARD std::get<int>(t1); // expected-error@tuple:* {{type not found}}
+    TEST_IGNORE_NODISCARD std::get<int>(t1); // expected-error@*:* {{type not found}}
     TEST_IGNORE_NODISCARD std::get<long>(t1); // expected-note {{requested here}}
     TEST_IGNORE_NODISCARD std::get<char>(t1); // expected-note {{requested here}}
-        // expected-error@tuple:* 2 {{type occurs more than once}}
+        // expected-error@*:* 2 {{type occurs more than once}}
     std::tuple<> t0;
     TEST_IGNORE_NODISCARD std::get<char*>(t0); // expected-node {{requested here}}
-        // expected-error@tuple:* 1 {{type not in empty type list}}
+        // expected-error@*:* {{type not in empty type list}}
 }
 
 void test_bad_return_type() {
diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp
index 68706d6c32af4f..50e7fc81387abc 100644
--- a/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp
@@ -21,6 +21,7 @@
 #include <cassert>
 #include <memory>
 #include <string>
+#include <tuple>
 #include <type_traits>
 #include <utility>
 #include <variant>
diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp
index 20472c62fc5f98..b005f303bc4b6c 100644
--- a/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp
@@ -22,6 +22,7 @@
 #include <memory>
 #include <string>
+#include <tuple>
 #include <type_traits>
 #include <utility>
 #include <variant>
 
diff --git a/libcxx/test/std/utilities/variant/variant.visit/visit.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit/visit.pass.cpp
index 097b784f2bf2ce..798ce7ded72a60 100644
--- a/libcxx/test/std/utilities/variant/variant.visit/visit.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.visit/visit.pass.cpp
@@ -15,6 +15,7 @@
 #include <cassert>
 #include <memory>
 #include <string>
+#include <tuple>
 #include <type_traits>
 #include <utility>
 #include <variant>
diff --git a/libcxx/test/std/utilities/variant/variant.visit/visit_return_type.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit/visit_return_type.pass.cpp
index eb425c07f93222..b1189dff656db4 100644
--- a/libcxx/test/std/utilities/variant/variant.visit/visit_return_type.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.visit/visit_return_type.pass.cpp
@@ -15,6 +15,7 @@
 #include <cassert>
 #include <memory>
 #include <string>
+#include <tuple>
 #include <type_traits>
 #include <utility>
 #include <variant>
diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 71a1b1111e4298..30ccd68f7b7506 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -994,7 +994,7 @@ addTaggedSymbolReferences(InputSectionBase &sec,
     error("non-RELA relocations are not allowed with memtag globals");
 
   for (const typename ELFT::Rela &rel : rels.relas) {
-    Symbol &sym = sec.getFile<ELFT>()->getRelocTargetSym(rel);
+    Symbol &sym = sec.file->getRelocTargetSym(rel);
     // Linker-synthesized symbols such as __executable_start may be referenced
     // as tagged in input objfiles, and we don't want them to be tagged. A
     // cheap way to exclude them is the type check, but their type is
diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp
index 019c073bd541b6..657332deebfde1 100644
--- a/lld/ELF/Arch/PPC64.cpp
+++ b/lld/ELF/Arch/PPC64.cpp
@@ -347,7 +347,7 @@ getRelaTocSymAndAddend(InputSectionBase *tocSec, uint64_t offset) {
   uint64_t index = std::min<uint64_t>(offset / 8, relas.size() - 1);
   for (;;) {
     if (relas[index].r_offset == offset) {
-      Symbol &sym = tocSec->getFile<ELFT>()->getRelocTargetSym(relas[index]);
+      Symbol &sym = tocSec->file->getRelocTargetSym(relas[index]);
       return {dyn_cast<Defined>(&sym), getAddend<ELFT>(relas[index])};
     }
     if (relas[index].r_offset < offset || index == 0)
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 24faa1753f1e3d..de4b2e345ac917 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -2302,9 +2302,9 @@ static void readSymbolPartitionSection(InputSectionBase *s) {
   Symbol *sym;
   const RelsOrRelas<ELFT> rels = s->template relsOrRelas<ELFT>();
   if (rels.areRelocsRel())
-    sym = &s->getFile<ELFT>()->getRelocTargetSym(rels.rels[0]);
+    sym = &s->file->getRelocTargetSym(rels.rels[0]);
   else
-    sym = &s->getFile<ELFT>()->getRelocTargetSym(rels.relas[0]);
+    sym = &s->file->getRelocTargetSym(rels.relas[0]);
   if (!isa<Defined>(sym) || !sym->includeInDynsym())
     return;
 
diff --git a/lld/ELF/ICF.cpp b/lld/ELF/ICF.cpp
index 9d7251037fb6d6..2551c2e807b73f 100644
--- a/lld/ELF/ICF.cpp
+++ b/lld/ELF/ICF.cpp
@@ -247,8 +247,8 @@ bool ICF<ELFT>::constantEq(const InputSection *secA, ArrayRef<RelTy> ra,
     uint64_t addA = getAddend<ELFT>(ra[i]);
     uint64_t addB = getAddend<ELFT>(rb[i]);
 
-    Symbol &sa = secA->template getFile<ELFT>()->getRelocTargetSym(ra[i]);
-    Symbol &sb = secB->template getFile<ELFT>()->getRelocTargetSym(rb[i]);
+    Symbol &sa = secA->file->getRelocTargetSym(ra[i]);
+    Symbol &sb = secB->file->getRelocTargetSym(rb[i]);
     if (&sa == &sb) {
       if (addA == addB)
         continue;
@@ -338,8 +338,8 @@ bool ICF<ELFT>::variableEq(const InputSection *secA, ArrayRef<RelTy> ra,
 
   for (size_t i = 0; i < ra.size(); ++i) {
     // The two sections must be identical.
-    Symbol &sa = secA->template getFile<ELFT>()->getRelocTargetSym(ra[i]);
-    Symbol &sb = secB->template getFile<ELFT>()->getRelocTargetSym(rb[i]);
+    Symbol &sa = secA->file->getRelocTargetSym(ra[i]);
+    Symbol &sb = secB->file->getRelocTargetSym(rb[i]);
     if (&sa == &sb)
       continue;
 
@@ -437,12 +437,12 @@ void ICF<ELFT>::forEachClass(llvm::function_ref<void(size_t, size_t)> fn) {
 
 // Combine the hashes of the sections referenced by the given section into its
 // hash.
-template <class ELFT, class RelTy>
+template <class RelTy>
 static void combineRelocHashes(unsigned cnt, InputSection *isec,
                                ArrayRef<RelTy> rels) {
   uint32_t hash = isec->eqClass[cnt % 2];
   for (RelTy rel : rels) {
-    Symbol &s = isec->template getFile<ELFT>()->getRelocTargetSym(rel);
+    Symbol &s = isec->file->getRelocTargetSym(rel);
     if (auto *d = dyn_cast<Defined>(&s))
       if (auto *relSec = dyn_cast_or_null<InputSection>(d->section))
         hash += relSec->eqClass[cnt % 2];
@@ -504,9 +504,9 @@ template <class ELFT> void ICF<ELFT>::run() {
     parallelForEach(sections, [&](InputSection *s) {
       const RelsOrRelas<ELFT> rels = s->template relsOrRelas<ELFT>();
       if (rels.areRelocsRel())
-        combineRelocHashes<ELFT>(cnt, s, rels.rels);
+        combineRelocHashes(cnt, s, rels.rels);
       else
-        combineRelocHashes<ELFT>(cnt, s, rels.relas);
+        combineRelocHashes(cnt, s, rels.relas);
     });
   }
 
diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h
index 0cbe00aa396acb..54de842a81cf35 100644
--- a/lld/ELF/InputFiles.h
+++ b/lld/ELF/InputFiles.h
@@ -99,6 +99,18 @@ class InputFile {
     return {symbols.get(), numSymbols};
   }
 
+  Symbol &getSymbol(uint32_t symbolIndex) const {
+    assert(fileKind == ObjKind);
+    if (symbolIndex >= numSymbols)
+      fatal(toString(this) + ": invalid symbol index");
+    return *this->symbols[symbolIndex];
+  }
+
+  template <typename RelT> Symbol &getRelocTargetSym(const RelT &rel) const {
+    uint32_t symIndex = rel.getSymbol(config->isMips64EL);
+    return getSymbol(symIndex);
+  }
+
   // Get filename to use for linker script processing.
   StringRef getNameForScript() const;
 
@@ -242,19 +254,8 @@ template <class ELFT> class ObjFile : public ELFFileBase {
   StringRef getShtGroupSignature(ArrayRef<Elf_Shdr> sections,
                                  const Elf_Shdr &sec);
 
-  Symbol &getSymbol(uint32_t symbolIndex) const {
-    if (symbolIndex >= numSymbols)
-      fatal(toString(this) + ": invalid symbol index");
-    return *this->symbols[symbolIndex];
-  }
-
   uint32_t getSectionIndex(const Elf_Sym &sym) const;
 
-  template <typename RelT> Symbol &getRelocTargetSym(const RelT &rel) const {
-    uint32_t symIndex = rel.getSymbol(config->isMips64EL);
-    return getSymbol(symIndex);
-  }
-
   std::optional<llvm::DILineInfo> getDILineInfo(const InputSectionBase *,
                                                 uint64_t);
   std::optional<std::pair<std::string, unsigned>>
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index e033a715b59214..7508a1336c91a4 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -924,7 +924,7 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
     if (!RelTy::IsRela)
       addend += target.getImplicitAddend(bufLoc, type);
 
-    Symbol &sym = getFile<ELFT>()->getRelocTargetSym(rel);
+    Symbol &sym = this->file->getRelocTargetSym(rel);
     RelExpr expr = target.getRelExpr(type, sym, bufLoc);
     if (expr == R_NONE)
       continue;
@@ -939,7 +939,7 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
           val = *tombstone;
         } else {
           val = sym.getVA(addend) -
-                (getFile<ELFT>()->getRelocTargetSym(rels[i]).getVA(0) +
+                (this->file->getRelocTargetSym(rels[i]).getVA(0) +
                  getAddend<ELFT>(rels[i]));
         }
         if (overwriteULEB128(bufLoc, val) >= 0x80)
diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp
index 0073ed42112a85..93c66e81d2fa91 100644
--- a/lld/ELF/MarkLive.cpp
+++ b/lld/ELF/MarkLive.cpp
@@ -89,9 +89,8 @@ template <class ELFT>
 template <class RelTy>
 void MarkLive<ELFT>::resolveReloc(InputSectionBase &sec, RelTy &rel,
                                   bool fromFDE) {
-  Symbol &sym = sec.getFile<ELFT>()->getRelocTargetSym(rel);
-
   // If a symbol is referenced in a live section, it is used.
+  Symbol &sym = sec.file->getRelocTargetSym(rel);
   sym.used = true;
 
   if (auto *d = dyn_cast<Defined>(&sym)) {
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 66b3e835cabc57..206fb0f5376664 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -365,8 +365,7 @@ CieRecord *EhFrameSection::addCie(EhSectionPiece &cie, ArrayRef<RelTy> rels) {
   Symbol *personality = nullptr;
   unsigned firstRelI = cie.firstRelocation;
   if (firstRelI != (unsigned)-1)
-    personality =
-        &cie.sec->template getFile<ELFT>()->getRelocTargetSym(rels[firstRelI]);
+    personality = &cie.sec->file->getRelocTargetSym(rels[firstRelI]);
 
   // Search for an existing CIE by CIE contents/relocation target pair.
   CieRecord *&rec = cieMap[{cie.data(), personality}];
@@ -396,7 +395,7 @@ Defined *EhFrameSection::isFdeLive(EhSectionPiece &fde, ArrayRef<RelTy> rels) {
     return nullptr;
 
   const RelTy &rel = rels[firstRelI];
-  Symbol &b = sec->template getFile<ELFT>()->getRelocTargetSym(rel);
+  Symbol &b = sec->file->getRelocTargetSym(rel);
 
   // FDEs for garbage-collected or merged-by-ICF sections, or sections in
   // another partition, are dead.
diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst
index 5f4d35ced6236c..995273a97b653a 100644
--- a/lldb/docs/resources/build.rst
+++ b/lldb/docs/resources/build.rst
@@ -73,7 +73,7 @@ commands below.
 ::
 
   $ yum install libedit-devel libxml2-devel ncurses-devel python-devel swig
-  $ sudo apt-get install build-essential swig python3-dev libedit-dev libncurses5-dev
+  $ sudo apt-get install build-essential swig python3-dev libedit-dev libncurses5-dev libxml2-dev
   $ pkg install swig python libxml2
   $ pkgin install swig python36 cmake ninja-build
   $ brew install swig cmake ninja
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 111c8cfa15d828..494d8abeb64d21 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -173,15 +173,18 @@ endforeach()
 set(NEED_LIBC_HDRGEN FALSE)
 if("libc" IN_LIST LLVM_ENABLE_RUNTIMES)
   set(NEED_LIBC_HDRGEN TRUE)
-else()
-  foreach(_name ${LLVM_RUNTIME_TARGETS})
-    if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES)
-      set(NEED_LIBC_HDRGEN TRUE)
-      if("${_name}" STREQUAL "amdgcn-amd-amdhsa" OR "${_name}" STREQUAL "nvptx64-nvidia-cuda")
-        set(LLVM_LIBC_GPU_BUILD ON)
-      endif()
+endif()
+foreach(_name ${LLVM_RUNTIME_TARGETS})
+  if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES)
+    set(NEED_LIBC_HDRGEN TRUE)
+    if("${_name}" STREQUAL "amdgcn-amd-amdhsa" OR "${_name}" STREQUAL "nvptx64-nvidia-cuda")
+      set(LLVM_LIBC_GPU_BUILD ON)
     endif()
-  endforeach()
+  endif()
+endforeach()
+if("${LIBC_TARGET_TRIPLE}" STREQUAL "amdgcn-amd-amdhsa" OR
+   "${LIBC_TARGET_TRIPLE}" STREQUAL "nvptx64-nvidia-cuda")
+  set(LLVM_LIBC_GPU_BUILD ON)
 endif()
 if(NEED_LIBC_HDRGEN)
   # To build the libc runtime, we need to be able to build few libc build
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 7f39f69cae60db..f5f37d9e8a3b03 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -525,16 +525,7 @@ it supports. Such code objects may not perform as well as those for the non-gene
 
 Generic processors are only available on code object V6 and above (see :ref:`amdgpu-elf-code-object`).
 
-Generic processor code objects are versioned (see :ref:`amdgpu-elf-header-e_flags-table-v6-onwards`) between 1 and 255.
-The version of non-generic code objects is always set to 0.
-
-For a generic code object, adding a new supported processor may require the code generated for the generic target to be changed
-so it can continue to execute on the previously supported processors as well as on the new one.
-When this happens, the generic code object version number is incremented at the same time as the generic target is updated.
-
-Each supported processor of a generic target is mapped to the version it was introduced in.
-A generic code object can execute on a supported processor if the version of the code object being loaded is
-greater than or equal to the version in which the processor was added to the generic target.
+Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor-versioning` for more information on how versioning works.
 
   .. table:: AMDGPU Generic Processors
      :name: amdgpu-generic-processor-table
@@ -621,6 +612,21 @@ greater than or equal to the version in which the processor was added to the gen
                                                                                                 - ``gfx1151``
      ==================== ============== ================= ================== ================= =================================
 
+.. _amdgpu-generic-processor-versioning:
+
+Generic Processor Versioning
+----------------------------
+
+Generic processor (see :ref:`amdgpu-generic-processor-table`) code objects are versioned (see :ref:`amdgpu-elf-header-e_flags-table-v6-onwards`) between 1 and 255.
+The version of non-generic code objects is always set to 0.
+
+For a generic code object, adding a new supported processor may require the code generated for the generic target to be changed
+so it can continue to execute on the previously supported processors as well as on the new one.
+When this happens, the generic code object version number is incremented at the same time as the generic target is updated.
+
+Each supported processor of a generic target is mapped to the version it was introduced in.
+A generic code object can execute on a supported processor if the version of the code object being loaded is
+greater than or equal to the version in which the processor was added to the generic target.
 
 .. _amdgpu-target-features:
 
@@ -1803,7 +1809,7 @@ The AMDGPU backend uses the following ELF header:
                                                              mask. This is a value between 1 and 255,
                                                              stored in the most significant byte
                                                              of EFLAGS.
-                                                             See :ref:`amdgpu-generic-processor-table`
+                                                             See :ref:`amdgpu-generic-processor-versioning`
      ============================================ ========== =========================================
 
   .. table:: AMDGPU ``EF_AMDGPU_MACH`` Values
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index 35c47989a7eef0..be5da5652e31e3 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -363,7 +363,7 @@ enabled sub-projects. Nearly all of these variable names begin with
   documentation targets being as part of a normal build.  If the ``install``
   target is run then this also enables all built documentation targets to be
   installed. Defaults to OFF.  To enable a particular documentation target, see
-  see LLVM_ENABLE_SPHINX and LLVM_ENABLE_DOXYGEN.
+  LLVM_ENABLE_SPHINX and LLVM_ENABLE_DOXYGEN.
 
 **LLVM_BUILD_EXAMPLES**:BOOL
   Build LLVM examples. Defaults to OFF. Targets for building each example are
diff --git a/llvm/docs/CommandGuide/llvm-remarkutil.rst b/llvm/docs/CommandGuide/llvm-remarkutil.rst
index 20f2b34ce00883..af7d8eb31c0181 100644
--- a/llvm/docs/CommandGuide/llvm-remarkutil.rst
+++ b/llvm/docs/CommandGuide/llvm-remarkutil.rst
@@ -3,12 +3,12 @@ llvm-remarkutil - Remark utility
 
 .. program:: llvm-remarkutil
 
-SYNOPSIS
+Synopsis
 --------
 
 :program:`llvm-remarkutil` [*subcommmand*] [*options*]
 
-DESCRIPTION
+Description
 -----------
 
 Utility for displaying information from, and converting between different
@@ -72,12 +72,14 @@ Instruction count remarks require asm-printer remarks.
 CSV format is as follows:
 
 ::
+
   Function,InstructionCount
   foo,123
 
 if `--use-debug-loc` is passed then the CSV will include the source path, line number and column.
 
 ::
+
   Source,Function,InstructionCount
   path:line:column,foo,3
 
@@ -101,12 +103,14 @@ Annotation count remarks require AnnotationRemarksPass remarks.
 CSV format is as follows:
 
 ::
+
   Function,Count
   foo,123
 
 if `--use-debug-loc` is passed then the CSV will include the source path, line number and column.
 
 ::
+
   Source,Function,Count
   path:line:column,foo,3
 
@@ -115,67 +119,83 @@ if `--use-debug-loc` is passed then the CSV will include the source path, line n
 count
 ~~~~~
 
-..program:: llvm-remarkutil count
+.. program:: llvm-remarkutil count
 
 USAGE: :program:`llvm-remarkutil` count [*options*] <input file>
 
 Summary
 ^^^^^^^
 
-:program:`llvm-remarkutil count` counts `remarks <https://llvm.org/docs/Remarks.html>` based on specified properties.
+:program:`llvm-remarkutil count` counts `remarks <https://llvm.org/docs/Remarks.html>`_ based on specified properties.
 By default the tool counts remarks based on how many occur in a source file or function or total for the generated remark file.
 The tool also supports collecting count based on specific remark arguments. The specified arguments should have an integer value to be able to report a count.
 
 The tool contains utilities to filter the remark count based on remark name, pass name, argument value and remark type.
-OPTIONS
--------
+
+Options
+^^^^^^^
 
 .. option:: --parser=<yaml|bitstream>
 
   Select the type of input remark parser. Required.
-  * ``yaml``: The tool will parse YAML remarks.
-  * ``bitstream``: The tool will parse bitstream remarks.
 
-.. option:: --count-by<value>
+  * ``yaml`` : The tool will parse YAML remarks.
+  * ``bitstream`` : The tool will parse bitstream remarks.
+
+.. option:: --count-by=<value>
+
   Select option to collect remarks by.
-  * ``remark-name``: count how many individual remarks exist.
-  * ``arg``: count remarks based on specified arguments passed by --(r)args. The argument value must be a number.
+
+  * ``remark-name`` : count how many individual remarks exist.
+  * ``arg`` : count remarks based on specified arguments passed by --(r)args. The argument value must be a number.
 
 .. option:: --group-by=<value>
+
   group count of remarks by property.
-  * ``source``: Count will be collected per source path. Remarks with no debug location will not be counted.
-  * ``function``: Count is collected per function.
-  * ``function-with-loc``: Count is collected per function per source. Remarks with no debug location will not be counted.
-  * ``Total``: Report a count for the provided remark file.
+
+  * ``source`` : Count will be collected per source path. Remarks with no debug location will not be counted.
+  * ``function`` : Count is collected per function.
+  * ``function-with-loc`` : Count is collected per function per source. Remarks with no debug location will not be counted.
+  * ``Total`` : Report a count for the provided remark file.
 
 .. option:: --args[=arguments]
+
   If `count-by` is set to `arg` this flag can be used to collect from specified remark arguments represented as a comma separated string.
   The arguments must have a numeral value to be able to count remarks by
 
 .. option:: --rargs[=arguments]
+
   If `count-by` is set to `arg` this flag can be used to collect from specified remark arguments using regular expression.
   The arguments must have a numeral value to be able to count remarks by
 
 .. option:: --pass-name[=<string>]
+
   Filter count by pass name.
 
 .. option:: --rpass-name[=<string>]
+
   Filter count by pass name using regular expressions.
 
 .. option:: --remark-name[=<string>]
+
   Filter count by remark name.
 
 .. option:: --rremark-name[=<string>]
+
   Filter count by remark name using regular expressions.
 
 .. option:: --filter-arg-by[=<string>]
+
   Filter count by argument value.
 
 .. option:: --rfilter-arg-by[=<string>]
+
   Filter count by argument value using regular expressions.
 
 .. option:: --remark-type=<value>
+
   Filter remarks by type with the following options.
+
   * ``unknown``
   * ``passed``
   * ``missed``
@@ -210,20 +230,22 @@ compiling a **fixed source** with **differing compilers** or
 `bitstream <https://llvm.org/docs/Remarks.html#llvm-bitstream-remarks>`_
 remarks.
 
-OPTIONS
--------
+Options
+^^^^^^^
 
 .. option:: --parser=<yaml|bitstream>
 
-  Select the type of input remark parser. Required.
-  * ``yaml``: The tool will parse YAML remarks.
-  * ``bitstream``: The tool will parse bitstream remarks.
+  Select the type of input remark parser. Required.
+
+  * ``yaml`` : The tool will parse YAML remarks.
+  * ``bitstream`` : The tool will parse bitstream remarks.
 
 .. option:: --report-style=<human|json>
 
   Output style.
-  * ``human``: Human-readable textual report. Default option.
-  * ``json``: JSON report.
+
+  * ``human`` : Human-readable textual report. Default option.
+  * ``json`` : JSON report.
 
 .. option:: --pretty
 
@@ -235,8 +257,8 @@ OPTIONS
 
   Output file for the report. Outputs to stdout by default.
 
-HUMAN-READABLE OUTPUT
----------------------
+Human-Readable Output
+^^^^^^^^^^^^^^^^^^^^^
 
 The human-readable format for :program:`llvm-remarkutil size-diff` is composed of
 two sections:
@@ -245,7 +267,7 @@ two sections:
 * A high-level summary of all changes.
 
 Changed Function Section
-~~~~~~~~~~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^^^^^^^^^^
 
 Suppose you are comparing two remark files OLD and NEW.
 
@@ -282,7 +304,7 @@ A breakdown of the format is below:
   Second file stack byte count - first file stack byte count.
 
 Summary Section
-~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^
 
 :program:`llvm-remarkutil size-diff` will output a high-level summary after
 printing all changed functions.
@@ -307,10 +329,10 @@ printing all changed functions.
   file.
 
 JSON OUTPUT
------------
+^^^^^^^^^^^
 
 High-Level view
-~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^
 
 Suppose we are comparing two files, OLD and NEW.
 
@@ -352,7 +374,7 @@ Suppose we are comparing two files, OLD and NEW.
   Functions only present in the second file.
 
 Function JSON
-~~~~~~~~~~~~~
+^^^^^^^^^^^^^
 
 The ``InBoth``, ``OnlyInA``, and ``OnlyInB`` sections contain size information
 for each function in the input remark files.
@@ -387,7 +409,7 @@ for each function in the input remark files.
   *  ``STACK_BYTES_B``: Stack bytes in NEW.
 
 Computing Diffs From Function JSON
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Function JSON does not contain the diffs. Tools consuming JSON output from
 :program:`llvm-remarkutil size-diff` are responsible for computing the diffs
@@ -399,7 +421,7 @@ separately.
 * Stack byte count diff: ``STACK_BYTES_B - STACK_BYTES_A``
 
 EXIT STATUS
------------
+^^^^^^^^^^^
 
 :program:`llvm-remarkutil size-diff` returns 0 on success, and a non-zero value
 otherwise.
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 257643c109ba1c..3f0fc160f9ea4e 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -244,14 +244,13 @@ class MachineRegisterInfo {
   bool isUpdatedCSRsInitialized() const { return IsUpdatedCSRsInitialized; }
 
   /// Returns true if a register can be used as an argument to a function.
-  bool isArgumentRegister(const MachineFunction &MF, MCRegister Reg) const;
+  bool isArgumentRegister(MCRegister Reg) const;
 
   /// Returns true if a register is a fixed register.
-  bool isFixedRegister(const MachineFunction &MF, MCRegister Reg) const;
+  bool isFixedRegister(MCRegister Reg) const;
 
   /// Returns true if a register is a general purpose register.
-  bool isGeneralPurposeRegister(const MachineFunction &MF,
-                                MCRegister Reg) const;
+  bool isGeneralPurposeRegister(MCRegister Reg) const;
 
   /// Disables the register from the list of CSRs.
   /// I.e. the register will not appear as part of the CSR mask.
@@ -930,7 +929,7 @@ class MachineRegisterInfo {
 
   /// freezeReservedRegs - Called by the register allocator to freeze the set
   /// of reserved registers before allocation begins.
-  void freezeReservedRegs(const MachineFunction&);
+  void freezeReservedRegs();
 
   /// reserveReg -- Mark a register as reserved so checks like isAllocatable 
   /// will not suggest using it. This should not be used during the middle
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 85de18f5169e5e..32ff15fc75936a 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -20,7 +20,7 @@
 #include "llvm/ADT/SparseMultiSet.h"
 #include "llvm/ADT/SparseSet.h"
 #include "llvm/ADT/identity.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -263,7 +263,7 @@ namespace llvm {
     MachineInstr *FirstDbgValue = nullptr;
 
     /// Set of live physical registers for updating kill flags.
-    LivePhysRegs LiveRegs;
+    LiveRegUnits LiveRegs;
 
   public:
     explicit ScheduleDAGInstrs(MachineFunction &mf,
diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp
index e7b14d700a44a1..c536ec9f79d625 100644
--- a/llvm/lib/CodeGen/HardwareLoops.cpp
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -503,6 +503,8 @@ Value *HardwareLoop::InitLoopCount() {
 
 Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
   IRBuilder<> Builder(BeginBB->getTerminator());
+  if (BeginBB->getParent()->getAttributes().hasFnAttr(Attribute::StrictFP))
+    Builder.setIsFPConstrained(true);
   Type *Ty = LoopCountInit->getType();
   bool UsePhi = UsePHICounter || Opts.ForcePhi;
   Intrinsic::ID ID = UseLoopGuard
@@ -535,6 +537,9 @@ Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
 
 void HardwareLoop::InsertLoopDec() {
   IRBuilder<> CondBuilder(ExitBranch);
+  if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr(
+          Attribute::StrictFP))
+    CondBuilder.setIsFPConstrained(true);
 
   Function *DecFunc =
     Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
@@ -557,6 +562,9 @@ void HardwareLoop::InsertLoopDec() {
 
 Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
   IRBuilder<> CondBuilder(ExitBranch);
+  if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr(
+          Attribute::StrictFP))
+    CondBuilder.setIsFPConstrained(true);
 
   Function *DecFunc =
       Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 68fff9bc221d0b..42c769399a1401 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -1666,13 +1666,27 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
     for (const MachineOperand &MO : MI.operands()) {
       if (MO.isReg() && MO.getReg().isVirtual()) {
         Register Reg = MO.getReg();
-        // If the new instructions refer to subregs but the old instructions did
-        // not, throw away any old live interval so it will be recomputed with
-        // subranges.
         if (MO.getSubReg() && hasInterval(Reg) &&
-            !getInterval(Reg).hasSubRanges() &&
-            MRI->shouldTrackSubRegLiveness(Reg))
-          removeInterval(Reg);
+            MRI->shouldTrackSubRegLiveness(Reg)) {
+          LiveInterval &LI = getInterval(Reg);
+          if (!LI.hasSubRanges()) {
+            // If the new instructions refer to subregs but the old instructions
+            // did not, throw away any old live interval so it will be
+            // recomputed with subranges.
+            removeInterval(Reg);
+          } else if (MO.isDef()) {
+            // Similarly if a subreg def has no precise subrange match then
+            // assume we need to recompute all subranges.
+            unsigned SubReg = MO.getSubReg();
+            LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg);
+            if (llvm::none_of(LI.subranges(),
+                              [Mask](LiveInterval::SubRange &SR) {
+                                return SR.LaneMask == Mask;
+                              })) {
+              removeInterval(Reg);
+            }
+          }
+        }
         if (!hasInterval(Reg)) {
           createAndComputeVirtRegInterval(Reg);
           // Don't bother to repair a freshly calculated live interval.
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 54f55623131b35..e09318a486955b 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -574,7 +574,7 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
   // FIXME: This is a temporary workaround until the reserved registers can be
   // serialized.
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  MRI.freezeReservedRegs(MF);
+  MRI.freezeReservedRegs();
 
   computeFunctionProperties(MF);
 
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index b8d3b2e30e6e6a..dc2f5ef15206e8 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -759,7 +759,7 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
   MF.getProperties().set(MachineFunctionProperties::Property::NoPHIs);
   MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
   MF.getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
-  MF.getRegInfo().freezeReservedRegs(MF);
+  MF.getRegInfo().freezeReservedRegs();
 
   // Compute live-in set for outlined fn
   const MachineRegisterInfo &MRI = MF.getRegInfo();
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index e88487fcc9f9e7..55d7c8370e9c46 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -517,8 +517,8 @@ LLVM_DUMP_METHOD void MachineRegisterInfo::dumpUses(Register Reg) const {
 }
 #endif
 
-void MachineRegisterInfo::freezeReservedRegs(const MachineFunction &MF) {
-  ReservedRegs = getTargetRegisterInfo()->getReservedRegs(MF);
+void MachineRegisterInfo::freezeReservedRegs() {
+  ReservedRegs = getTargetRegisterInfo()->getReservedRegs(*MF);
   assert(ReservedRegs.size() == getTargetRegisterInfo()->getNumRegs() &&
          "Invalid ReservedRegs vector from target");
 }
@@ -660,17 +660,14 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
   return false;
 }
 
-bool MachineRegisterInfo::isArgumentRegister(const MachineFunction &MF,
-                                             MCRegister Reg) const {
-  return getTargetRegisterInfo()->isArgumentRegister(MF, Reg);
+bool MachineRegisterInfo::isArgumentRegister(MCRegister Reg) const {
+  return getTargetRegisterInfo()->isArgumentRegister(*MF, Reg);
 }
 
-bool MachineRegisterInfo::isFixedRegister(const MachineFunction &MF,
-                                          MCRegister Reg) const {
-  return getTargetRegisterInfo()->isFixedRegister(MF, Reg);
+bool MachineRegisterInfo::isFixedRegister(MCRegister Reg) const {
+  return getTargetRegisterInfo()->isFixedRegister(*MF, Reg);
 }
 
-bool MachineRegisterInfo::isGeneralPurposeRegister(const MachineFunction &MF,
-                                                   MCRegister Reg) const {
-  return getTargetRegisterInfo()->isGeneralPurposeRegister(MF, Reg);
+bool MachineRegisterInfo::isGeneralPurposeRegister(MCRegister Reg) const {
+  return getTargetRegisterInfo()->isGeneralPurposeRegister(*MF, Reg);
 }
diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp
index 900f0e9079d698..d0dec372f68961 100644
--- a/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -61,7 +61,7 @@ void RegAllocBase::init(VirtRegMap &vrm, LiveIntervals &lis,
   VRM = &vrm;
   LIS = &lis;
   Matrix = &mat;
-  MRI->freezeReservedRegs(vrm.getMachineFunction());
+  MRI->freezeReservedRegs();
   RegClassInfo.runOnMachineFunction(vrm.getMachineFunction());
 }
 
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index e81d4793013682..6740e1f0edb4f4 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -1740,7 +1740,7 @@ bool RegAllocFast::runOnMachineFunction(MachineFunction &MF) {
   TRI = STI.getRegisterInfo();
   TII = STI.getInstrInfo();
   MFI = &MF.getFrameInfo();
-  MRI->freezeReservedRegs(MF);
+  MRI->freezeReservedRegs();
   RegClassInfo.runOnMachineFunction(MF);
   unsigned NumRegUnits = TRI->getNumRegUnits();
   UsedInInstr.clear();
diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp
index b8ee5dc0f8494b..aea92788057971 100644
--- a/llvm/lib/CodeGen/RegAllocPBQP.cpp
+++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -809,7 +809,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
   std::unique_ptr<Spiller> VRegSpiller(
       createInlineSpiller(*this, MF, VRM, DefaultVRAI));
 
-  MF.getRegInfo().freezeReservedRegs(MF);
+  MF.getRegInfo().freezeReservedRegs();
 
   LLVM_DEBUG(dbgs() << "PBQP Register Allocating for " << MF.getName() << "\n");
 
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 0190fa345eb363..51ede7992af53d 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -1103,7 +1103,7 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
              dbgs() << "Loading SUnits:\n"; loads.dump());
 }
 
-static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
+static void toggleKills(const MachineRegisterInfo &MRI, LiveRegUnits &LiveRegs,
                         MachineInstr &MI, bool addToLiveRegs) {
   for (MachineOperand &MO : MI.operands()) {
     if (!MO.isReg() || !MO.readsReg())
@@ -1113,8 +1113,10 @@ static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
       continue;
 
     // Things that are available after the instruction are killed by it.
-    bool IsKill = LiveRegs.available(MRI, Reg);
-    MO.setIsKill(IsKill);
+    bool IsKill = LiveRegs.available(Reg);
+
+    // Exception: Do not kill reserved registers
+    MO.setIsKill(IsKill && !MRI.isReserved(Reg));
     if (addToLiveRegs)
       LiveRegs.addReg(Reg);
   }
@@ -1144,7 +1146,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
           continue;
         LiveRegs.removeReg(Reg);
       } else if (MO.isRegMask()) {
-        LiveRegs.removeRegsInMask(MO);
+        LiveRegs.removeRegsNotPreserved(MO.getRegMask());
       }
     }
 
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index a2aeb66835b29e..8ac55ee6a5d0c1 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2336,7 +2336,7 @@ bool TargetLoweringBase::isLoadBitCastBeneficial(
 }
 
 void TargetLoweringBase::finalizeLowering(MachineFunction &MF) const {
-  MF.getRegInfo().freezeReservedRegs(MF);
+  MF.getRegInfo().freezeReservedRegs();
 }
 
 MachineMemOperand::Flags TargetLoweringBase::getLoadMemOperandFlags(
diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp
index 34aeb62a87a026..48ad8de778010e 100644
--- a/llvm/lib/CodeGen/TypePromotion.cpp
+++ b/llvm/lib/CodeGen/TypePromotion.cpp
@@ -136,7 +136,6 @@ class IRPromoter {
 
 class TypePromotionImpl {
   unsigned TypeSize = 0;
-  const TargetLowering *TLI = nullptr;
   LLVMContext *Ctx = nullptr;
   unsigned RegisterBitWidth = 0;
   SmallPtrSet<Value *, 16> AllVisited;
@@ -273,58 +272,64 @@ bool TypePromotionImpl::isSink(Value *V) {
 
 /// Return whether this instruction can safely wrap.
 bool TypePromotionImpl::isSafeWrap(Instruction *I) {
-  // We can support a potentially wrapping Add/Sub instruction (I) if:
+  // We can support a potentially wrapping instruction (I) if:
   // - It is only used by an unsigned icmp.
   // - The icmp uses a constant.
+  // - The wrapping value (I) is decreasing, i.e. would underflow - wrapping
+  //   around zero to become a larger number than before.
   // - The wrapping instruction (I) also uses a constant.
   //
-  // This a common pattern emitted to check if a value is within a range.
+  // We can then use the two constants to calculate whether the result would
+  // wrap in respect to itself in the original bitwidth. If it doesn't wrap,
+  // just underflows the range, the icmp would give the same result whether the
+  // result has been truncated or not. We calculate this by:
+  // - Zero extending both constants, if needed, to RegisterBitWidth.
+  // - Take the absolute value of I's constant, adding this to the icmp const.
+  // - Check that this value is not out of range for small type. If it is, it
+  //   means that it has underflowed enough to wrap around the icmp constant.
   //
   // For example:
   //
-  // %sub = sub i8 %a, C1
-  // %cmp = icmp ule i8 %sub, C2
-  //
-  // or
-  //
-  // %add = add i8 %a, C1
-  // %cmp = icmp ule i8 %add, C2.
-  //
-  // We will treat an add as though it were a subtract by -C1. To promote
-  // the Add/Sub we will zero extend the LHS and the subtracted amount. For Add,
-  // this means we need to negate the constant, zero extend to RegisterBitWidth,
-  // and negate in the larger type.
+  // %sub = sub i8 %a, 2
+  // %cmp = icmp ule i8 %sub, 254
   //
-  // This will produce a value in the range [-zext(C1), zext(X)-zext(C1)] where
-  // C1 is the subtracted amount. This is either a small unsigned number or a
-  // large unsigned number in the promoted type.
+  // If %a = 0, %sub = -2 == FE == 254
+  // But if this is evaluated as an i32
+  // %sub = -2 == FF FF FF FE == 4294967294
+  // So the unsigned compares (i8 and i32) would not yield the same result.
   //
-  // Now we need to correct the compare constant C2. Values >= C1 in the
-  // original add result range have been remapped to large values in the
-  // promoted range. If the compare constant fell into this range we need to
-  // remap it as well. We can do this as -(zext(-C2)).
+  // Another way to look at it is:
+  // %a - 2 <= 254
+  // %a + 2 <= 254 + 2
+  // %a <= 256
+  // And we can't represent 256 in the i8 format, so we don't support it.
   //
-  // For example:
+  // Whereas:
   //
-  // %sub = sub i8 %a, 2
+  // %sub = sub i8 %a, 1
   // %cmp = icmp ule i8 %sub, 254
   //
-  // becomes
+  // If %a = 0, %sub = -1 == FF == 255
+  // As i32:
+  // %sub = -1 == FF FF FF FF == 4294967295
   //
-  // %zext = zext %a to i32
-  // %sub = sub i32 %zext, 2
-  // %cmp = icmp ule i32 %sub, 4294967294
+  // In this case, the unsigned compare results would be the same and this
+  // would also be true for ult, uge and ugt:
+  // - (255 < 254) == (0xFFFFFFFF < 254) == false
+  // - (255 <= 254) == (0xFFFFFFFF <= 254) == false
+  // - (255 > 254) == (0xFFFFFFFF > 254) == true
+  // - (255 >= 254) == (0xFFFFFFFF >= 254) == true
   //
-  // Another example:
+  // To demonstrate why we can't handle increasing values:
   //
-  // %sub = sub i8 %a, 1
-  // %cmp = icmp ule i8 %sub, 254
+  // %add = add i8 %a, 2
+  // %cmp = icmp ult i8 %add, 127
   //
-  // becomes
+  // If %a = 254, %add = 256 == (i8 0)
+  // As i32:
+  // %add = 256
   //
-  // %zext = zext %a to i32
-  // %sub = sub i32 %zext, 1
-  // %cmp = icmp ule i32 %sub, 254
+  // (0 < 127) != (256 < 127)
 
   unsigned Opc = I->getOpcode();
   if (Opc != Instruction::Add && Opc != Instruction::Sub)
@@ -351,23 +356,15 @@ bool TypePromotionImpl::isSafeWrap(Instruction *I) {
   APInt OverflowConst = cast<ConstantInt>(I->getOperand(1))->getValue();
   if (Opc == Instruction::Sub)
     OverflowConst = -OverflowConst;
-
-  // If the constant is positive, we will end up filling the promoted bits with
-  // all 1s. Make sure that results in a cheap add constant.
-  if (!OverflowConst.isNonPositive()) {
-    // We don't have the true promoted width, just use 64 so we can create an
-    // int64_t for the isLegalAddImmediate call.
-    if (OverflowConst.getBitWidth() >= 64)
-      return false;
-
-    APInt NewConst = -((-OverflowConst).zext(64));
-    if (!TLI->isLegalAddImmediate(NewConst.getSExtValue()))
-      return false;
-  }
+  if (!OverflowConst.isNonPositive())
+    return false;
 
   SafeWrap.insert(I);
 
-  if (OverflowConst.ugt(ICmpConst)) {
+  // Using C1 = OverflowConst and C2 = ICmpConst, we can either prove that:
+  //   zext(x) + sext(C1) <u zext(C2)  if C1 < 0 and C1 >s C2
+  //   zext(x) + sext(C1) <u sext(C2)  if C1 < 0 and C1 <=s C2
+  if (OverflowConst.sgt(ICmpConst)) {
     LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for sext "
                       << "const of " << *I << "\n");
     return true;
@@ -490,24 +487,18 @@ void IRPromoter::PromoteTree() {
         continue;
 
       if (auto *Const = dyn_cast<ConstantInt>(Op)) {
-        // For subtract, we only need to zext the constant. We only put it in
+        // For subtract, we don't need to sext the constant. We only put it in
         // SafeWrap because SafeWrap.size() is used elsewhere.
-        // For Add and ICmp we need to find how far the constant is from the
-        // top of its original unsigned range and place it the same distance
-        // from the top of its new unsigned range. We can do this by negating
-        // the constant, zero extending it, then negating in the new type.
-        APInt NewConst;
-        if (SafeWrap.contains(I)) {
-          if (I->getOpcode() == Instruction::ICmp)
-            NewConst = -((-Const->getValue()).zext(PromotedWidth));
-          else if (I->getOpcode() == Instruction::Add && i == 1)
-            NewConst = -((-Const->getValue()).zext(PromotedWidth));
-          else
-            NewConst = Const->getValue().zext(PromotedWidth);
-        } else
-          NewConst = Const->getValue().zext(PromotedWidth);
-
-        I->setOperand(i, ConstantInt::get(Const->getContext(), NewConst));
+        // For cmp, we need to sign extend a constant appearing in either
+        // operand. For add, we should only sign extend the RHS.
+        Constant *NewConst =
+            ConstantInt::get(Const->getContext(),
+                             (SafeWrap.contains(I) &&
+                              (I->getOpcode() == Instruction::ICmp || i == 1) &&
+                              I->getOpcode() != Instruction::Sub)
+                                 ? Const->getValue().sext(PromotedWidth)
+                                 : Const->getValue().zext(PromotedWidth));
+        I->setOperand(i, NewConst);
       } else if (isa<UndefValue>(Op))
         I->setOperand(i, ConstantInt::get(ExtTy, 0));
     }
@@ -926,7 +917,7 @@ bool TypePromotionImpl::run(Function &F, const TargetMachine *TM,
   bool MadeChange = false;
   const DataLayout &DL = F.getParent()->getDataLayout();
   const TargetSubtargetInfo *SubtargetInfo = TM->getSubtargetImpl(F);
-  TLI = SubtargetInfo->getTargetLowering();
+  const TargetLowering *TLI = SubtargetInfo->getTargetLowering();
   RegisterBitWidth =
       TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedValue();
   Ctx = &F.getParent()->getContext();
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 73e2b365f109a9..e09c632842d6e9 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -116,22 +116,22 @@ class SectionEntry {
 /// linker.
 class RelocationEntry {
 public:
-  /// SectionID - the section this relocation points to.
-  unsigned SectionID;
-
   /// Offset - offset into the section.
   uint64_t Offset;
 
-  /// RelType - relocation type.
-  uint32_t RelType;
-
   /// Addend - the relocation addend encoded in the instruction itself.  Also
   /// used to make a relocation section relative instead of symbol relative.
   int64_t Addend;
 
+  /// SectionID - the section this relocation points to.
+  unsigned SectionID;
+
+  /// RelType - relocation type.
+  uint32_t RelType;
+
   struct SectionPair {
-      uint32_t SectionA;
-      uint32_t SectionB;
+    uint32_t SectionA;
+    uint32_t SectionB;
   };
 
   /// SymOffset - Section offset of the relocation entry's symbol (used for GOT
@@ -141,36 +141,36 @@ class RelocationEntry {
     SectionPair Sections;
   };
 
-  /// True if this is a PCRel relocation (MachO specific).
-  bool IsPCRel;
-
   /// The size of this relocation (MachO specific).
   unsigned Size;
 
+  /// True if this is a PCRel relocation (MachO specific).
+  bool IsPCRel : 1;
+
   // ARM (MachO and COFF) specific.
-  bool IsTargetThumbFunc = false;
+  bool IsTargetThumbFunc : 1;
 
   RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend)
-      : SectionID(id), Offset(offset), RelType(type), Addend(addend),
-        SymOffset(0), IsPCRel(false), Size(0), IsTargetThumbFunc(false) {}
+      : Offset(offset), Addend(addend), SectionID(id), RelType(type),
+        SymOffset(0), Size(0), IsPCRel(false), IsTargetThumbFunc(false) {}
 
   RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend,
                   uint64_t symoffset)
-      : SectionID(id), Offset(offset), RelType(type), Addend(addend),
-        SymOffset(symoffset), IsPCRel(false), Size(0),
+      : Offset(offset), Addend(addend), SectionID(id), RelType(type),
+        SymOffset(symoffset), Size(0), IsPCRel(false),
         IsTargetThumbFunc(false) {}
 
   RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend,
                   bool IsPCRel, unsigned Size)
-      : SectionID(id), Offset(offset), RelType(type), Addend(addend),
-        SymOffset(0), IsPCRel(IsPCRel), Size(Size), IsTargetThumbFunc(false) {}
+      : Offset(offset), Addend(addend), SectionID(id), RelType(type),
+        SymOffset(0), Size(Size), IsPCRel(IsPCRel), IsTargetThumbFunc(false) {}
 
   RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend,
                   unsigned SectionA, uint64_t SectionAOffset, unsigned SectionB,
                   uint64_t SectionBOffset, bool IsPCRel, unsigned Size)
-      : SectionID(id), Offset(offset), RelType(type),
-        Addend(SectionAOffset - SectionBOffset + addend), IsPCRel(IsPCRel),
-        Size(Size), IsTargetThumbFunc(false) {
+      : Offset(offset), Addend(SectionAOffset - SectionBOffset + addend),
+        SectionID(id), RelType(type), Size(Size), IsPCRel(IsPCRel),
+        IsTargetThumbFunc(false) {
     Sections.SectionA = SectionA;
     Sections.SectionB = SectionB;
   }
@@ -179,9 +179,9 @@ class RelocationEntry {
                   unsigned SectionA, uint64_t SectionAOffset, unsigned SectionB,
                   uint64_t SectionBOffset, bool IsPCRel, unsigned Size,
                   bool IsTargetThumbFunc)
-      : SectionID(id), Offset(offset), RelType(type),
-        Addend(SectionAOffset - SectionBOffset + addend), IsPCRel(IsPCRel),
-        Size(Size), IsTargetThumbFunc(IsTargetThumbFunc) {
+      : Offset(offset), Addend(SectionAOffset - SectionBOffset + addend),
+        SectionID(id), RelType(type), Size(Size), IsPCRel(IsPCRel),
+        IsTargetThumbFunc(IsTargetThumbFunc) {
     Sections.SectionA = SectionA;
     Sections.SectionB = SectionB;
   }
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index ce221758ef798b..e863ef3eb8d6d7 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -149,6 +149,18 @@ void Instruction::insertBefore(BasicBlock &BB,
   if (!InsertAtHead) {
     DPMarker *SrcMarker = BB.getMarker(InsertPos);
     if (SrcMarker && !SrcMarker->empty()) {
+      // If this assertion fires, the calling code is about to insert a PHI
+      // after debug-records, which would form a sequence like:
+      //     %0 = PHI
+      //     #dbg_value
+      //     %1 = PHI
+      // Which is de-normalised and undesired -- hence the assertion. To avoid
+      // this, you must insert at that position using an iterator, and it must
+      // be acquired by calling getFirstNonPHIIt / begin or similar methods on
+      // the block. This will signal to this behind-the-scenes debug-info
+      // maintenance code that you intend the PHI to be ahead of everything,
+      // including any debug-info.
+      assert(!isa<PHINode>(this) && "Inserting PHI after debug-records!");
       adoptDbgValues(&BB, InsertPos, false);
     }
   }
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index ce090c3b8a7444..0e6c01802cfb8c 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2691,6 +2691,11 @@ void Verifier::visitFunction(const Function &F) {
   Check(verifyAttributeCount(Attrs, FT->getNumParams()),
         "Attribute after last parameter!", &F);
 
+  CheckDI(F.IsNewDbgInfoFormat == F.getParent()->IsNewDbgInfoFormat,
+          "Function debug format should match parent module", &F,
+          F.IsNewDbgInfoFormat, F.getParent(),
+          F.getParent()->IsNewDbgInfoFormat);
+
   bool IsIntrinsic = F.isIntrinsic();
 
   // Check function attributes.
@@ -3034,6 +3039,11 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
     Check(I.getParent() == &BB, "Instruction has bogus parent pointer!");
   }
 
+  CheckDI(BB.IsNewDbgInfoFormat == BB.getParent()->IsNewDbgInfoFormat,
+          "BB debug format should match parent function", &BB,
+          BB.IsNewDbgInfoFormat, BB.getParent(),
+          BB.getParent()->IsNewDbgInfoFormat);
+
   // Confirm that no issues arise from the debug program.
   if (BB.IsNewDbgInfoFormat)
     CheckDI(!BB.getTrailingDPValues(), "Basic Block has trailing DbgRecords!",
diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
index 4afc678abaca63..d21aa59659a25d 100644
--- a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
@@ -183,7 +183,7 @@ static MachineFunction &createFrameHelperMachineFunction(Module *M,
   MF.getProperties().reset(MachineFunctionProperties::Property::TracksLiveness);
   MF.getProperties().reset(MachineFunctionProperties::Property::IsSSA);
   MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
-  MF.getRegInfo().freezeReservedRegs(MF);
+  MF.getRegInfo().freezeReservedRegs();
 
   // Create entry block.
   BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 34fcb9aec38fa4..2cfd1de93a04f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1671,7 +1671,7 @@ PtrParts SplitPtrStructs::visitSelectInst(SelectInst &SI) {
   auto [TrueRsrc, TrueOff] = getPtrParts(True);
   auto [FalseRsrc, FalseOff] = getPtrParts(False);
 
-  Value *RsrcRes = RsrcRes =
+  Value *RsrcRes =
       IRB.CreateSelect(Cond, TrueRsrc, FalseRsrc, SI.getName() + ".rsrc", &SI);
   copyMetadata(RsrcRes, &SI);
   Conditionals.push_back(&SI);
@@ -1841,6 +1841,7 @@ static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy,
   bool IsIntrinsic = OldF->isIntrinsic();
   Function *NewF =
       Function::Create(NewTy, OldF->getLinkage(), OldF->getAddressSpace());
+  NewF->IsNewDbgInfoFormat = OldF->IsNewDbgInfoFormat;
   NewF->copyAttributesFrom(OldF);
   NewF->copyMetadata(OldF, 0);
   NewF->takeName(OldF);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 5762f1906a16d3..b85cb26fdc9565 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -340,15 +340,25 @@ class AMDGPULowerModuleLDS {
 
     // Get uses from the current function, excluding uses by called functions
     // Two output variables to avoid walking the globals list twice
+    std::optional<bool> HasAbsoluteGVs;
     for (auto &GV : M.globals()) {
       if (!AMDGPU::isLDSVariableToLower(GV)) {
         continue;
       }
 
-      if (GV.isAbsoluteSymbolRef()) {
-        report_fatal_error(
-            "LDS variables with absolute addresses are unimplemented.");
-      }
+      // Check if the module is consistent: either all GVs are absolute (happens
+      // when we run the pass more than once), or none are.
+      const bool IsAbsolute = GV.isAbsoluteSymbolRef();
+      if (HasAbsoluteGVs.has_value()) {
+        if (*HasAbsoluteGVs != IsAbsolute) {
+          report_fatal_error(
+              "Module cannot mix absolute and non-absolute LDS GVs");
+        }
+      } else
+        HasAbsoluteGVs = IsAbsolute;
+
+      if (IsAbsolute)
+        continue;
 
       for (User *V : GV.users()) {
         if (auto *I = dyn_cast<Instruction>(V)) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index a6a01479b5b18a..4700a984770bfb 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -70,7 +70,7 @@ static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
 
 MCRegisterInfo *llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour) {
   MCRegisterInfo *X = new MCRegisterInfo();
-  InitAMDGPUMCRegisterInfo(X, AMDGPU::PC_REG, DwarfFlavour);
+  InitAMDGPUMCRegisterInfo(X, AMDGPU::PC_REG, DwarfFlavour, DwarfFlavour);
   return X;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e8022c8b0afa19..c19c3c6017a7c8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3780,8 +3780,28 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
       MIB.add(MI.getOperand(I));
     updateLiveVariables(LV, MI, *MIB);
-    if (LIS)
+    if (LIS) {
       LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+      // SlotIndex of defs needs to be updated when converting to early-clobber
+      MachineOperand &Def = MIB->getOperand(0);
+      if (Def.isEarlyClobber() && Def.isReg() &&
+          LIS->hasInterval(Def.getReg())) {
+        SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
+        SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
+        auto &LI = LIS->getInterval(Def.getReg());
+        auto UpdateDefIndex = [&](LiveRange &LR) {
+          auto S = LR.find(OldIndex);
+          if (S != LR.end() && S->start == OldIndex) {
+            assert(S->valno && S->valno->def == OldIndex);
+            S->start = NewIndex;
+            S->valno->def = NewIndex;
+          }
+        };
+        UpdateDefIndex(LI);
+        for (auto &SR : LI.subranges())
+          UpdateDefIndex(SR);
+      }
+    }
     return MIB;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 0c57110b4eb15d..398f870a9f5311 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -156,7 +156,7 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
   RegsToRewrite.clear();
 
   // Update the set of reserved registers to include WWM ones.
-  MRI->freezeReservedRegs(MF);
+  MRI->freezeReservedRegs();
 }
 
 #ifndef NDEBUG
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 3664535b325997..5c64c6bcd1968c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -318,8 +318,9 @@ struct SGPRSpillBuilder {
 } // namespace llvm
 
 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
-    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
-      SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
+    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
+                            ST.getAMDGPUDwarfFlavour()),
+      ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
 
   assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
          getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index f0b69b0b09809f..0f7858a3be9f9f 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1470,13 +1470,19 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8(
 
   // Lazy store all fp registers to the stack.
   // This executes as NOP in the absence of floating-point support.
-  MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
-                                  .addReg(ARM::SP)
-                                  .add(predOps(ARMCC::AL));
-  for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
-                 ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
-    VLSTM.addReg(R, RegState::Implicit |
-                        (LiveRegs.contains(R) ? 0 : RegState::Undef));
+  MachineInstrBuilder VLSTM =
+      BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+          .addReg(ARM::SP)
+          .add(predOps(ARMCC::AL))
+          .addImm(0); // Represents a pseudo register list, has no effect on
+                      // the encoding.
+  // Mark non-live registers as undef
+  for (MachineOperand &MO : VLSTM->implicit_operands()) {
+    if (MO.isReg() && !MO.isDef()) {
+      Register Reg = MO.getReg();
+      MO.setIsUndef(!LiveRegs.contains(Reg));
+    }
+  }
 
   // Restore all arguments
   for (const auto &Regs : ClearedFPRegs) {
@@ -1564,13 +1570,19 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV81(MachineBasicBlock &MBB,
         .add(predOps(ARMCC::AL));
 
     // Lazy store all FP registers to the stack
-    MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
-                                    .addReg(ARM::SP)
-                                    .add(predOps(ARMCC::AL));
-    for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
-                   ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
-      VLSTM.addReg(R, RegState::Implicit |
-                          (LiveRegs.contains(R) ? 0 : RegState::Undef));
+    MachineInstrBuilder VLSTM =
+        BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+            .addReg(ARM::SP)
+            .add(predOps(ARMCC::AL))
+            .addImm(0); // Represents a pseudo register list, has no effect on
+                        // the encoding.
+    // Mark non-live registers as undef
+    for (MachineOperand &MO : VLSTM->implicit_operands()) {
+      if (MO.isReg() && MO.isImplicit() && !MO.isDef()) {
+        Register Reg = MO.getReg();
+        MO.setIsUndef(!LiveRegs.contains(Reg));
+      }
+    }
   } else {
     // Push all the callee-saved registers (s16-s31).
     MachineInstrBuilder VPUSH =
@@ -1673,9 +1685,12 @@ void ARMExpandPseudo::CMSERestoreFPRegsV8(
 
   // Lazy load fp regs from stack.
   // This executes as NOP in the absence of floating-point support.
-  MachineInstrBuilder VLLDM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
-                                  .addReg(ARM::SP)
-                                  .add(predOps(ARMCC::AL));
+  MachineInstrBuilder VLLDM =
+      BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
+          .addReg(ARM::SP)
+          .add(predOps(ARMCC::AL))
+          .addImm(0); // Represents a pseudo register list, has no effect on
+                      // the encoding.
 
   if (STI->fixCMSE_CVE_2021_35465()) {
     auto Bundler = MIBundleBuilder(MBB, VLLDM);
@@ -1757,7 +1772,9 @@ void ARMExpandPseudo::CMSERestoreFPRegsV81(
     // Load FP registers from stack.
     BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
         .addReg(ARM::SP)
-        .add(predOps(ARMCC::AL));
+        .add(predOps(ARMCC::AL))
+        .addImm(0); // Represents a pseudo register list, has no effect on the
+                    // encoding.
 
     // Pop the stack space
     BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP)
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 14e315534570d2..404085820a6660 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -1749,6 +1749,37 @@ class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
   let Inst{8}     = 0;          // Single precision
 }
 
+// Single Precision with fixed registers.
+// For when the registers-to-be-stored/loaded are fixed, e.g. VLLDM and VLSTM
+class AXSI4FR<string asm, bit et, bit load>
+    : InstARM<AddrMode4, 4, IndexModeNone, VFPLdStMulFrm, VFPDomain, "", NoItinerary> {
+  // Instruction operands.
+  bits<4> Rn;
+  bits<13> regs;    // Does not affect encoding, for assembly/disassembly only.
+  list<Predicate> Predicates = [HasVFP2];
+  let OutOperandList = (outs);
+  let InOperandList = (ins GPRnopc:$Rn, pred:$p, dpr_reglist:$regs);
+  let AsmString = asm;
+  let Pattern = [];
+  let DecoderNamespace = "VFP";
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{31-28} = 0b1110;
+  let Inst{27-25} = 0b110;
+  let Inst{24}    = 0b0;
+  let Inst{23}    = 0b0;
+  let Inst{22}    = 0b0;
+  let Inst{21}    = 0b1;
+  let Inst{20}    = load;       // Distinguishes vlldm from vlstm
+  let Inst{15-12} = 0b0000;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{7}     = et;         // encoding type, 0 for T1 and 1 for T2.
+  let Inst{6-0}   = 0b0000000;
+  let mayLoad     = load;
+  let mayStore    = !eq(load, 0);
+}
+
 // Double precision, unary
 class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
            bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 55d3efbd9b9a2b..3094a4db2b4d12 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -313,29 +313,51 @@ def : MnemonicAlias<"vstm", "vstmia">;
 //===----------------------------------------------------------------------===//
 //  Lazy load / store multiple Instructions
 //
-def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
-                  NoItinerary, "vlldm${p}\t$Rn", "", []>,
+// VLLDM and VLSTM:
+// 2 encoding options:
+// T1 (bit 7 is 0):
+// T1 takes an optional dpr_reglist, must be '{d0-d15}' (exactly)
+// T1 requires v8-M.Main, secure state, target with 16 D registers (or with no D registers - NOP)
+// T2 (bit 7 is 1):
+// T2 takes a mandatory dpr_reglist, must be '{d0-d31}' (exactly)
+// T2 requires v8.1-M.Main, secure state, target with 16/32 D registers (or with no D registers - NOP)
+// (source: Arm v8-M ARM, DDI0553B.v ID16122022)
+
+def VLLDM : AXSI4FR<"vlldm${p}\t$Rn, $regs", 0, 1>,
             Requires<[HasV8MMainline, Has8MSecExt]> {
-    let Inst{24-23} = 0b00;
-    let Inst{22}    = 0;
-    let Inst{21}    = 1;
-    let Inst{20}    = 1;
-    let Inst{15-12} = 0;
-    let Inst{7-0}   = 0;
-    let mayLoad     = 1;
-    let Defs = [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, VPR, FPSCR, FPSCR_NZCV];
-}
-
-def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
-                  NoItinerary, "vlstm${p}\t$Rn", "", []>,
+    let Defs = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15];
+    let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly does not contain the register list.
+def : InstAlias<"vlldm${p}\t$Rn", (VLLDM GPRnopc:$Rn, pred:$p, 0)>,
+                Requires<[HasV8MMainline, Has8MSecExt]>;
+// T2: assembly must contain the register list.
+// The register list has no effect on the encoding, it is for assembly/disassembly purposes only.
+def VLLDM_T2 : AXSI4FR<"vlldm${p}\t$Rn, $regs", 1, 1>,
+            Requires<[HasV8_1MMainline, Has8MSecExt]> {
+    let Defs = [VPR, FPSCR, FPSCR_NZCV, D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
+                                        D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31];
+    let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly contains the register list.
+// The register list has no effect on the encoding, it is for assembly/disassembly purposes only.
+def VLSTM : AXSI4FR<"vlstm${p}\t$Rn, $regs", 0, 0>,
             Requires<[HasV8MMainline, Has8MSecExt]> {
-    let Inst{24-23} = 0b00;
-    let Inst{22}    = 0;
-    let Inst{21}    = 1;
-    let Inst{20}    = 0;
-    let Inst{15-12} = 0;
-    let Inst{7-0}   = 0;
-    let mayStore    = 1;
+    let Defs = [VPR, FPSCR, FPSCR_NZCV];
+    let Uses = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15];
+    let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly does not contain the register list.
+def : InstAlias<"vlstm${p}\t$Rn", (VLSTM GPRnopc:$Rn, pred:$p, 0)>,
+                Requires<[HasV8MMainline, Has8MSecExt]>;
+// T2: assembly must contain the register list.
+// The register list has no effect on the encoding, it is for assembly/disassembly purposes only.
+def VLSTM_T2 : AXSI4FR<"vlstm${p}\t$Rn, $regs", 1, 0>,
+            Requires<[HasV8_1MMainline, Has8MSecExt]> {
+    let Defs = [VPR, FPSCR, FPSCR_NZCV];
+    let Uses = [VPR, FPSCR, FPSCR_NZCV, D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
+                                        D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31];
+    let DecoderMethod = "DecodeLazyLoadStoreMul";
 }
 
 def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r), 0>,
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index efec163c6ed634..c320bf723c88bb 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -450,11 +450,12 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands,
                            unsigned ListNo);
 
-  int tryParseRegister();
+  int tryParseRegister(bool AllowOutofBoundReg = false);
   bool tryParseRegisterWithWriteBack(OperandVector &);
   int tryParseShiftRegister(OperandVector &);
   bool parseRegisterList(OperandVector &, bool EnforceOrder = true,
-                         bool AllowRAAC = false);
+                         bool AllowRAAC = false,
+                         bool AllowOutOfBoundReg = false);
   bool parseMemory(OperandVector &);
   bool parseOperand(OperandVector &, StringRef Mnemonic);
   bool parseImmExpr(int64_t &Out);
@@ -4073,7 +4074,7 @@ ParseStatus ARMAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
 /// Try to parse a register name.  The token must be an Identifier when called,
 /// and if it is a register name the token is eaten and the register number is
 /// returned.  Otherwise return -1.
-int ARMAsmParser::tryParseRegister() {
+int ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) {
   MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier)) return -1;
@@ -4117,7 +4118,8 @@ int ARMAsmParser::tryParseRegister() {
   }
 
   // Some FPUs only have 16 D registers, so D16-D31 are invalid
-  if (!hasD32() && RegNum >= ARM::D16 && RegNum <= ARM::D31)
+  if (!AllowOutOfBoundReg && !hasD32() && RegNum >= ARM::D16 &&
+      RegNum <= ARM::D31)
     return -1;
 
   Parser.Lex(); // Eat identifier token.
@@ -4457,7 +4459,7 @@ insertNoDuplicates(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs,
 
 /// Parse a register list.
 bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
-                                     bool AllowRAAC) {
+                                     bool AllowRAAC, bool AllowOutOfBoundReg) {
   MCAsmParser &Parser = getParser();
   if (Parser.getTok().isNot(AsmToken::LCurly))
     return TokError("Token is not a Left Curly Brace");
@@ -4511,7 +4513,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
         return Error(RegLoc, "pseudo-register not allowed");
       Parser.Lex(); // Eat the minus.
       SMLoc AfterMinusLoc = Parser.getTok().getLoc();
-      int EndReg = tryParseRegister();
+      int EndReg = tryParseRegister(AllowOutOfBoundReg);
       if (EndReg == -1)
         return Error(AfterMinusLoc, "register expected");
       if (EndReg == ARM::RA_AUTH_CODE)
@@ -4546,7 +4548,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
     RegLoc = Parser.getTok().getLoc();
     int OldReg = Reg;
     const AsmToken RegTok = Parser.getTok();
-    Reg = tryParseRegister();
+    Reg = tryParseRegister(AllowOutOfBoundReg);
     if (Reg == -1)
       return Error(RegLoc, "register expected");
     if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE)
@@ -6086,8 +6088,11 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
   }
   case AsmToken::LBrac:
     return parseMemory(Operands);
-  case AsmToken::LCurly:
-    return parseRegisterList(Operands, !Mnemonic.starts_with("clr"));
+  case AsmToken::LCurly: {
+    bool AllowOutOfBoundReg = Mnemonic == "vlldm" || Mnemonic == "vlstm";
+    return parseRegisterList(Operands, !Mnemonic.starts_with("clr"), false,
+                             AllowOutOfBoundReg);
+  }
   case AsmToken::Dollar:
   case AsmToken::Hash: {
     // #42 -> immediate
@@ -7597,6 +7602,33 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
 
   const unsigned Opcode = Inst.getOpcode();
   switch (Opcode) {
+  case ARM::VLLDM:
+  case ARM::VLLDM_T2:
+  case ARM::VLSTM:
+  case ARM::VLSTM_T2: {
+    // Since in some cases both T1 and T2 are valid, tablegen can not always
+    // pick the correct instruction.
+    if (Operands.size() == 4) { // a register list has been provided
+      ARMOperand &Op = static_cast<ARMOperand &>(
+          *Operands[3]); // the register list, a dpr_reglist
+      assert(Op.isDPRRegList());
+      auto &RegList = Op.getRegList();
+      // T2 requires v8.1-M.Main (cannot be handled by tablegen)
+      if (RegList.size() == 32 && !hasV8_1MMainline()) {
+        return Error(Op.getEndLoc(), "T2 version requires v8.1-M.Main");
+      }
+      // When target has 32 D registers, T1 is undefined.
+      if (hasD32() && RegList.size() != 32) {
+        return Error(Op.getEndLoc(), "operand must be exactly {d0-d31}");
+      }
+      // When target has 16 D registers, both T1 and T2 are valid.
+      if (!hasD32() && (RegList.size() != 16 && RegList.size() != 32)) {
+        return Error(Op.getEndLoc(),
+                     "operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)");
+      }
+    }
+    return false;
+  }
   case ARM::t2IT: {
     // Encoding is unpredictable if it ever results in a notional 'NV'
     // predicate. Since we don't parse 'NV' directly this means an 'AL'
@@ -8732,6 +8764,32 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
   }
 
   switch (Inst.getOpcode()) {
+  case ARM::VLLDM:
+  case ARM::VLSTM: {
+    // In some cases both T1 and T2 are valid, which makes tablegen pick T1
+    // instead of T2; replace the opcode with the T2 one when that is needed.
+    if (Operands.size() == 4) { // A register list has been provided.
+      ARMOperand &Op = static_cast<ARMOperand &>(
+          *Operands[3]); // The register list, a dpr_reglist.
+      assert(Op.isDPRRegList());
+      auto &RegList = Op.getRegList();
+      // When the register list is {d0-d31}, the instruction has to be the T2
+      // variant, so rebuild it with the T2 opcode and the same four operands.
+      if (RegList.size() == 32) {
+        const unsigned Opcode =
+            (Inst.getOpcode() == ARM::VLLDM) ? ARM::VLLDM_T2 : ARM::VLSTM_T2;
+        MCInst TmpInst;
+        TmpInst.setOpcode(Opcode);
+        TmpInst.addOperand(Inst.getOperand(0));
+        TmpInst.addOperand(Inst.getOperand(1));
+        TmpInst.addOperand(Inst.getOperand(2));
+        TmpInst.addOperand(Inst.getOperand(3));
+        Inst = TmpInst;
+        return true;
+      }
+    }
+    return false;
+  }
   // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
   case ARM::LDRT_POST:
   case ARM::LDRBT_POST: {
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 604f22d7111900..705f3cbce12f02 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -700,6 +700,9 @@ DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address,
 static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder);
+static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const MCDisassembler *Decoder);
 
 #include "ARMGenDisassemblerTables.inc"
 
@@ -7030,3 +7033,23 @@ static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
 
   return DS;
 }
+
+static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const MCDisassembler *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  const unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  // Add Rn, which holds the memory location to save to / load from. It is the
+  // only operand that is actually encoded in the instruction.
+  // '$Rn' in the assembly.
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // An optional predicate, '$p' in the assembly; it is always decoded as AL.
+  DecodePredicateOperand(Inst, ARMCC::AL, Address, Decoder);
+  // An immediate that represents a floating-point register list. '$regs' in
+  // the assembly.
+  Inst.addOperand(MCOperand::createImm(0)); // Arbitrary value, has no effect.
+
+  return S;
+}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index fbd067d79af0b3..24e627cd9a4e1f 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -91,6 +91,38 @@ void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address,
   unsigned Opcode = MI->getOpcode();
 
   switch (Opcode) {
+  case ARM::VLLDM: { // Printed with the fixed register list {d0 - d15}.
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlldm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d15}";
+    return;
+  }
+  case ARM::VLLDM_T2: { // Printed with the fixed register list {d0 - d31}.
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlldm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d31}";
+    return;
+  }
+  case ARM::VLSTM: { // Printed with the fixed register list {d0 - d15}.
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlstm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d15}";
+    return;
+  }
+  case ARM::VLSTM_T2: { // Printed with the fixed register list {d0 - d31}.
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlstm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d31}";
+    return;
+  }
   // Check for MOVs and print canonical forms, instead.
   case ARM::MOVsr: {
     // FIXME: Thumb variants?
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9b748cdcf74511..08678a859ae2b6 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9080,15 +9080,20 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     SDValue Result, Chain;
 
     // TODO: We restrict this to unmasked loads currently in consideration of
-    // the complexity of hanlding all falses masks.
-    if (IsUnmasked && isNullConstant(Stride)) {
-      MVT ScalarVT = ContainerVT.getVectorElementType();
+    // the complexity of handling all-false masks.
+    MVT ScalarVT = ContainerVT.getVectorElementType();
+    if (IsUnmasked && isNullConstant(Stride) && ContainerVT.isInteger()) {
       SDValue ScalarLoad =
           DAG.getExtLoad(ISD::ZEXTLOAD, DL, XLenVT, Load->getChain(), Ptr,
                          ScalarVT, Load->getMemOperand());
       Chain = ScalarLoad.getValue(1);
       Result = lowerScalarSplat(SDValue(), ScalarLoad, VL, ContainerVT, DL, DAG,
                                 Subtarget);
+    } else if (IsUnmasked && isNullConstant(Stride) && isTypeLegal(ScalarVT)) {
+      SDValue ScalarLoad = DAG.getLoad(ScalarVT, DL, Load->getChain(), Ptr,
+                                       Load->getMemOperand());
+      Chain = ScalarLoad.getValue(1);
+      Result = DAG.getSplat(ContainerVT, DL, ScalarLoad);
     } else {
       SDValue IntID = DAG.getTargetConstant(
           IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
@@ -13485,6 +13490,7 @@ struct NodeExtensionHelper {
     MVT NarrowVT = getNarrowType(Root, *SupportsExt);
 
     SDValue Source = getSource();
+    assert(Subtarget.getTargetLowering()->isTypeLegal(Source.getValueType()));
     if (Source.getValueType() == NarrowVT)
       return Source;
 
@@ -13652,10 +13658,6 @@ struct NodeExtensionHelper {
       unsigned ScalarBits = VT.getScalarSizeInBits();
       unsigned NarrowScalarBits = NarrowVT.getScalarSizeInBits();
 
-      // Ensure the narrowing element type is legal
-      if (!Subtarget.getTargetLowering()->isTypeLegal(NarrowElt.getValueType()))
-        break;
-
       // Ensure the extension's semantic is equivalent to rvv vzext or vsext.
       if (ScalarBits != NarrowScalarBits * 2)
         break;
@@ -13727,14 +13729,11 @@ struct NodeExtensionHelper {
   }
 
   /// Check if \p Root supports any extension folding combines.
-  static bool isSupportedRoot(const SDNode *Root, const SelectionDAG &DAG) {
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  static bool isSupportedRoot(const SDNode *Root) {
     switch (Root->getOpcode()) {
     case ISD::ADD:
     case ISD::SUB:
     case ISD::MUL: {
-      if (!TLI.isTypeLegal(Root->getValueType(0)))
-        return false;
       return Root->getValueType(0).isScalableVector();
     }
     // Vector Widening Integer Add/Sub/Mul Instructions
@@ -13751,7 +13750,7 @@ struct NodeExtensionHelper {
     case RISCVISD::FMUL_VL:
     case RISCVISD::VFWADD_W_VL:
     case RISCVISD::VFWSUB_W_VL:
-      return TLI.isTypeLegal(Root->getValueType(0));
+      return true;
     default:
       return false;
     }
@@ -13760,9 +13759,10 @@ struct NodeExtensionHelper {
   /// Build a NodeExtensionHelper for \p Root.getOperand(\p OperandIdx).
   NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG,
                       const RISCVSubtarget &Subtarget) {
-    assert(isSupportedRoot(Root, DAG) && "Trying to build an helper with an "
-                                         "unsupported root");
+    assert(isSupportedRoot(Root) && "Trying to build an helper with an "
+                                    "unsupported root");
     assert(OperandIdx < 2 && "Requesting something else than LHS or RHS");
+    assert(DAG.getTargetLoweringInfo().isTypeLegal(Root->getValueType(0)));
     OrigOperand = Root->getOperand(OperandIdx);
 
     unsigned Opc = Root->getOpcode();
@@ -13812,7 +13812,7 @@ struct NodeExtensionHelper {
   static std::pair<SDValue, SDValue>
   getMaskAndVL(const SDNode *Root, SelectionDAG &DAG,
                const RISCVSubtarget &Subtarget) {
-    assert(isSupportedRoot(Root, DAG) && "Unexpected root");
+    assert(isSupportedRoot(Root) && "Unexpected root");
     switch (Root->getOpcode()) {
     case ISD::ADD:
     case ISD::SUB:
@@ -14112,8 +14112,10 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
                                            TargetLowering::DAGCombinerInfo &DCI,
                                            const RISCVSubtarget &Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
+  if (DCI.isBeforeLegalize())
+    return SDValue();
 
-  if (!NodeExtensionHelper::isSupportedRoot(N, DAG))
+  if (!NodeExtensionHelper::isSupportedRoot(N))
     return SDValue();
 
   SmallVector<SDNode *> Worklist;
@@ -14124,7 +14126,7 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
 
   while (!Worklist.empty()) {
     SDNode *Root = Worklist.pop_back_val();
-    if (!NodeExtensionHelper::isSupportedRoot(Root, DAG))
+    if (!NodeExtensionHelper::isSupportedRoot(Root))
       return SDValue();
 
     NodeExtensionHelper LHS(N, 0, DAG, Subtarget);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index d2d824da9c7831..d7807c12037807 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -1637,11 +1637,11 @@ def VID_V : RVInstV<0b010100, 0b10001, OPMVV, (outs VR:$vd),
 let vm = 1, RVVConstraint = NoConstraint in {
 def VMV_X_S : RVInstV<0b010000, 0b00000, OPMVV, (outs GPR:$vd),
                       (ins VR:$vs2), "vmv.x.s", "$vd, $vs2">,
-              Sched<[WriteVIMovVX, ReadVIMovVX]>;
+              Sched<[WriteVMovXS, ReadVMovXS]>;
 let Constraints = "$vd = $vd_wb" in
 def VMV_S_X : RVInstV2<0b010000, 0b00000, OPMVX, (outs VR:$vd_wb),
                       (ins VR:$vd, GPR:$rs1), "vmv.s.x", "$vd, $rs1">,
-              Sched<[WriteVIMovXV, ReadVIMovXV, ReadVIMovXX]>;
+              Sched<[WriteVMovSX, ReadVMovSX_V, ReadVMovSX_X]>;
 }
 
 } // hasSideEffects = 0, mayLoad = 0, mayStore = 0
@@ -1655,11 +1655,11 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1,
 // Floating-Point Scalar Move Instructions
 def VFMV_F_S : RVInstV<0b010000, 0b00000, OPFVV, (outs FPR32:$vd),
                       (ins VR:$vs2), "vfmv.f.s", "$vd, $vs2">,
-               Sched<[WriteVFMovVF, ReadVFMovVF]>;
+               Sched<[WriteVMovFS, ReadVMovFS]>;
 let Constraints = "$vd = $vd_wb" in
 def VFMV_S_F : RVInstV2<0b010000, 0b00000, OPFVF, (outs VR:$vd_wb),
                        (ins VR:$vd, FPR32:$rs1), "vfmv.s.f", "$vd, $rs1">,
-               Sched<[WriteVFMovFV, ReadVFMovFV, ReadVFMovFX]>;
+               Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>;
 
 } // hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 48cf48e8af58fb..ae93bf69487565 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -6767,14 +6767,14 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
   let HasSEWOp = 1, BaseInstr = VMV_X_S in
   def PseudoVMV_X_S:
     Pseudo<(outs GPR:$rd), (ins VR:$rs2, ixlenimm:$sew), []>,
-    Sched<[WriteVIMovVX, ReadVIMovVX]>,
+    Sched<[WriteVMovXS, ReadVMovXS]>,
     RISCVVPseudo;
   let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X,
       Constraints = "$rd = $rs1" in
   def PseudoVMV_S_X: Pseudo<(outs VR:$rd),
                             (ins VR:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew),
                             []>,
-    Sched<[WriteVIMovXV, ReadVIMovXV, ReadVIMovXX]>,
+    Sched<[WriteVMovSX, ReadVMovSX_V, ReadVMovSX_X]>,
     RISCVVPseudo;
 }
 } // Predicates = [HasVInstructions]
@@ -6793,7 +6793,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
         def "PseudoVFMV_" # f.FX # "_S_" # mx :
           Pseudo<(outs f.fprclass:$rd),
                  (ins m.vrclass:$rs2, ixlenimm:$sew), []>,
-          Sched<[WriteVFMovVF, ReadVFMovVF]>,
+          Sched<[WriteVMovFS, ReadVMovFS]>,
           RISCVVPseudo;
         let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F,
             Constraints = "$rd = $rs1" in
@@ -6802,7 +6802,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
                                                  (ins m.vrclass:$rs1, f.fprclass:$rs2,
                                                       AVL:$vl, ixlenimm:$sew),
                                                  []>,
-          Sched<[WriteVFMovFV, ReadVFMovFV, ReadVFMovFX]>,
+          Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>,
           RISCVVPseudo;
       }
     }
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index b21a56bdcdd20a..240d170bfcf6f9 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -887,10 +887,10 @@ foreach mx = SchedMxList in {
 
 // 16. Vector Permutation Instructions
 let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 1)] in {
-  def : WriteRes<WriteVIMovVX, [SiFive7VCQ, SiFive7VA]>;
-  def : WriteRes<WriteVIMovXV, [SiFive7VCQ, SiFive7VA]>;
-  def : WriteRes<WriteVFMovVF, [SiFive7VCQ, SiFive7VA]>;
-  def : WriteRes<WriteVFMovFV, [SiFive7VCQ, SiFive7VA]>;
+  def : WriteRes<WriteVMovSX, [SiFive7VCQ, SiFive7VA]>;
+  def : WriteRes<WriteVMovXS, [SiFive7VCQ, SiFive7VA]>;
+  def : WriteRes<WriteVMovSF, [SiFive7VCQ, SiFive7VA]>;
+  def : WriteRes<WriteVMovFS, [SiFive7VCQ, SiFive7VA]>;
 }
 foreach mx = SchedMxList in {
   defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
@@ -1190,12 +1190,12 @@ defm "" : LMULReadAdvance<"ReadVMSFSV", 0>;
 defm "" : LMULReadAdvance<"ReadVIotaV", 0>;
 
 // 17. Vector Permutation Instructions
-def : ReadAdvance<ReadVIMovVX, 0>;
-def : ReadAdvance<ReadVIMovXV, 0>;
-def : ReadAdvance<ReadVIMovXX, 0>;
-def : ReadAdvance<ReadVFMovVF, 0>;
-def : ReadAdvance<ReadVFMovFV, 0>;
-def : ReadAdvance<ReadVFMovFX, 0>;
+def : ReadAdvance<ReadVMovXS, 0>;
+def : ReadAdvance<ReadVMovSX_V, 0>;
+def : ReadAdvance<ReadVMovSX_X, 0>;
+def : ReadAdvance<ReadVMovFS, 0>;
+def : ReadAdvance<ReadVMovSF_V, 0>;
+def : ReadAdvance<ReadVMovSF_F, 0>;
 defm "" : LMULReadAdvance<"ReadVISlideV", 0>;
 defm "" : LMULReadAdvance<"ReadVISlideX", 0>;
 defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index 0be681de3daf69..379622d4ca834c 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -196,7 +196,7 @@ multiclass LMULSEWReadAdvanceImpl<string name, int val, list<SchedWrite> writes
 // by the ReadAdvance. For example:
 // ```
 //   defm "" : LMULReadAdvance<"ReadVIALUX", 1,
-//                             LMULSchedWriteList<["WriteVIMovVX"]>.value>;
+//                             LMULSchedWriteList<["WriteVMovSX"]>.value>;
 // ```
 class LMULSchedWriteListImpl<list<string> names, list<string> MxList> {
   list<SchedWrite> value = !foldl([]<SchedWrite>,
@@ -484,11 +484,11 @@ defm "" : LMULSchedWrites<"WriteVIdxV">;
 
 // 16. Vector Permutation Instructions
 // 16.1. Integer Scalar Move Instructions
-def WriteVIMovVX : SchedWrite;
-def WriteVIMovXV : SchedWrite;
+def WriteVMovSX : SchedWrite;
+def WriteVMovXS : SchedWrite;
 // 16.2. Floating-Point Scalar Move Instructions
-def WriteVFMovVF : SchedWrite;
-def WriteVFMovFV : SchedWrite;
+def WriteVMovSF : SchedWrite;
+def WriteVMovFS : SchedWrite;
 // 16.3. Vector Slide Instructions
 defm "" : LMULSchedWrites<"WriteVISlideX">;
 defm "" : LMULSchedWrites<"WriteVISlideI">;
@@ -709,13 +709,13 @@ defm "" : LMULSchedReads<"ReadVIotaV">;
 
 // 16. Vector Permutation Instructions
 // 16.1. Integer Scalar Move Instructions
-def ReadVIMovVX : SchedRead;
-def ReadVIMovXV : SchedRead;
-def ReadVIMovXX : SchedRead;
+def ReadVMovXS : SchedRead;
+def ReadVMovSX_V : SchedRead;
+def ReadVMovSX_X : SchedRead;
 // 16.2. Floating-Point Scalar Move Instructions
-def ReadVFMovVF : SchedRead;
-def ReadVFMovFV : SchedRead;
-def ReadVFMovFX : SchedRead;
+def ReadVMovFS : SchedRead;
+def ReadVMovSF_V : SchedRead;
+def ReadVMovSF_F : SchedRead;
 // 16.3. Vector Slide Instructions
 defm "" : LMULSchedReads<"ReadVISlideV">;
 defm "" : LMULSchedReads<"ReadVISlideX">;
@@ -921,10 +921,10 @@ defm "" : LMULWriteRes<"WriteVIotaV", []>;
 defm "" : LMULWriteRes<"WriteVIdxV", []>;
 
 // 16. Vector Permutation Instructions
-def : WriteRes<WriteVIMovVX, []>;
-def : WriteRes<WriteVIMovXV, []>;
-def : WriteRes<WriteVFMovVF, []>;
-def : WriteRes<WriteVFMovFV, []>;
+def : WriteRes<WriteVMovSX, []>;
+def : WriteRes<WriteVMovXS, []>;
+def : WriteRes<WriteVMovSF, []>;
+def : WriteRes<WriteVMovFS, []>;
 defm "" : LMULWriteRes<"WriteVISlideX", []>;
 defm "" : LMULWriteRes<"WriteVISlideI", []>;
 defm "" : LMULWriteRes<"WriteVISlide1X", []>;
@@ -1082,12 +1082,12 @@ defm "" : LMULReadAdvance<"ReadVMSFSV", 0>;
 defm "" : LMULReadAdvance<"ReadVIotaV", 0>;
 
 // 16. Vector Permutation Instructions
-def : ReadAdvance<ReadVIMovVX, 0>;
-def : ReadAdvance<ReadVIMovXV, 0>;
-def : ReadAdvance<ReadVIMovXX, 0>;
-def : ReadAdvance<ReadVFMovVF, 0>;
-def : ReadAdvance<ReadVFMovFV, 0>;
-def : ReadAdvance<ReadVFMovFX, 0>;
+def : ReadAdvance<ReadVMovXS, 0>;
+def : ReadAdvance<ReadVMovSX_V, 0>;
+def : ReadAdvance<ReadVMovSX_X, 0>;
+def : ReadAdvance<ReadVMovFS, 0>;
+def : ReadAdvance<ReadVMovSF_V, 0>;
+def : ReadAdvance<ReadVMovSF_F, 0>;
 defm "" : LMULReadAdvance<"ReadVISlideV", 0>;
 defm "" : LMULReadAdvance<"ReadVISlideX", 0>;
 defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
index 1535149b919b55..e7c1a7e5d8bca1 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
@@ -32,6 +32,16 @@ void RISCVELFTargetObjectFile::Initialize(MCContext &Ctx,
       ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
   SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
                                                ELF::SHF_WRITE | ELF::SHF_ALLOC);
+  SmallRODataSection =
+      getContext().getELFSection(".srodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+  SmallROData4Section = getContext().getELFSection(
+      ".srodata.cst4", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 4);
+  SmallROData8Section = getContext().getELFSection(
+      ".srodata.cst8", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 8);
+  SmallROData16Section = getContext().getELFSection(
+      ".srodata.cst16", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 16);
+  SmallROData32Section = getContext().getELFSection(
+      ".srodata.cst32", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 32);
 }
 
 const MCExpr *RISCVELFTargetObjectFile::getIndirectSymViaGOTPCRel(
@@ -126,8 +136,19 @@ bool RISCVELFTargetObjectFile::isConstantInSmallSection(
 MCSection *RISCVELFTargetObjectFile::getSectionForConstant(
     const DataLayout &DL, SectionKind Kind, const Constant *C,
     Align &Alignment) const {
-  if (isConstantInSmallSection(DL, C))
-    return SmallDataSection;
+  if (isConstantInSmallSection(DL, C)) {
+    if (Kind.isMergeableConst4())
+      return SmallROData4Section;
+    if (Kind.isMergeableConst8())
+      return SmallROData8Section;
+    if (Kind.isMergeableConst16())
+      return SmallROData16Section;
+    if (Kind.isMergeableConst32())
+      return SmallROData32Section;
+    // LLVM only generates up to .rodata.cst32 and uses the .rodata section for
+    // anything larger than 32 bytes, so just use .srodata here.
+    return SmallRODataSection;
+  }
 
   // Otherwise, we work the same as ELF.
   return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
index 0910fbd3d95041..05e61ac874abba 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
@@ -16,6 +16,11 @@ namespace llvm {
 /// This implementation is used for RISC-V ELF targets.
 class RISCVELFTargetObjectFile : public TargetLoweringObjectFileELF {
   MCSection *SmallDataSection;
+  MCSection *SmallRODataSection;
+  MCSection *SmallROData4Section;
+  MCSection *SmallROData8Section;
+  MCSection *SmallROData16Section;
+  MCSection *SmallROData32Section;
   MCSection *SmallBSSSection;
   unsigned SSThreshold = 8;
 
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 9370fb51a96c56..e4adb7be564952 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -20,6 +20,8 @@
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "systemztti"
@@ -1284,17 +1286,42 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
   return NumVectorMemOps + NumPermutes;
 }
 
-static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
+static int
+getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                            const SmallVectorImpl<Type *> &ParamTys) {
   if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
     return getNumVectorRegs(RetTy); // VPERM
+
+  if (ID == Intrinsic::vector_reduce_add) {
+    // Retrieve the number and size of the elements of the vector operand.
+    auto *VTy = cast<FixedVectorType>(ParamTys.front());
+    unsigned NumElements = VTy->getNumElements();
+    unsigned ScalarSize = VTy->getScalarSizeInBits();
+    // For scalar sizes >128 bits, return -1 to use the generic cost estimate.
+    if (ScalarSize > SystemZ::VectorBits)
+      return -1;
+    // A single vector register can hold this many elements.
+    unsigned MaxElemsPerVector = SystemZ::VectorBits / ScalarSize;
+    // This many vector regs are needed to represent the input elements (V).
+    unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
+    // This many instructions are needed for the final sum of vector elems (S).
+    unsigned LastVectorHandling =
+        2 * Log2_32_Ceil(std::min(NumElements, MaxElemsPerVector));
+    // We use vector adds to create a sum vector, which takes
+    // V/2 + V/4 + ... = V - 1 operations.
+    // Then, we need S operations to sum up the elements of that sum vector,
+    // for a total of V + S - 1 operations.
+    int Cost = VectorRegsNeeded + LastVectorHandling - 1;
+    return Cost;
+  }
   return -1;
 }
 
 InstructionCost
 SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                       TTI::TargetCostKind CostKind) {
-  InstructionCost Cost =
-      getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
+  InstructionCost Cost = getVectorIntrinsicInstrCost(
+      ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
   if (Cost != -1)
     return Cost;
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5cbd9ab4dc2d6c..76c6c1645239ab 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -6133,14 +6133,18 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
 
     MachineSDNode *CNode;
     if (NeedMask) {
-      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
-      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
+      unsigned ROpc =
+          Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
+      unsigned MOpc =
+          Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
       CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
       ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
     }
     if (NeedIndex || !NeedMask) {
-      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
-      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
+      unsigned ROpc =
+          Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
+      unsigned MOpc =
+          Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
       CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
       ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
     }
@@ -6168,15 +6172,19 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
 
     MachineSDNode *CNode;
     if (NeedMask) {
-      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
-      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
-      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
-                           InGlue);
+      unsigned ROpc =
+          Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
+      unsigned MOpc =
+          Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
+      CNode =
+          emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
       ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
     }
     if (NeedIndex || !NeedMask) {
-      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
-      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
+      unsigned ROpc =
+          Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
+      unsigned MOpc =
+          Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
       CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
       ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
     }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index eb249b25374a7d..a74901958ac056 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28926,6 +28926,9 @@ SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
 // supported by the Subtarget
 static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
                                         unsigned Opcode) {
+  assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
+         "Unexpected shift opcode");
+
   if (!VT.isSimple())
     return false;
 
@@ -28959,6 +28962,9 @@ bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
 // natively supported by the Subtarget
 static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
                                     unsigned Opcode) {
+  assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
+         "Unexpected shift opcode");
+
   if (!VT.isSimple())
     return false;
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index af0ed071c29aba..b65f49527ae5dd 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2491,12 +2491,12 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     WorkingMI->removeOperand(3);
     break;
   }
-  case X86::PCLMULQDQrr:
-  case X86::VPCLMULQDQrr:
-  case X86::VPCLMULQDQYrr:
-  case X86::VPCLMULQDQZrr:
-  case X86::VPCLMULQDQZ128rr:
-  case X86::VPCLMULQDQZ256rr: {
+  case X86::PCLMULQDQrri:
+  case X86::VPCLMULQDQrri:
+  case X86::VPCLMULQDQYrri:
+  case X86::VPCLMULQDQZrri:
+  case X86::VPCLMULQDQZ128rri:
+  case X86::VPCLMULQDQZ256rri: {
     // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
     // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
     unsigned Imm = MI.getOperand(3).getImm();
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index fd20090fe0973b..4a542b7e5a1bb0 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6561,12 +6561,12 @@ let Constraints = "$src1 = $dst" in
 //===----------------------------------------------------------------------===//
 
 multiclass pcmpistrm_SS42AI<string asm> {
-  def rr : SS42AI<0x62, MRMSrcReg, (outs),
+  def rri : SS42AI<0x62, MRMSrcReg, (outs),
     (ins VR128:$src1, VR128:$src2, u8imm:$src3),
     !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
     []>, Sched<[WritePCmpIStrM]>;
   let mayLoad = 1 in
-  def rm :SS42AI<0x62, MRMSrcMem, (outs),
+  def rmi :SS42AI<0x62, MRMSrcMem, (outs),
     (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
     !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
     []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
@@ -6579,12 +6579,12 @@ let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
 }
 
 multiclass SS42AI_pcmpestrm<string asm> {
-  def rr : SS42AI<0x60, MRMSrcReg, (outs),
+  def rri : SS42AI<0x60, MRMSrcReg, (outs),
     (ins VR128:$src1, VR128:$src3, u8imm:$src5),
     !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
     []>, Sched<[WritePCmpEStrM]>;
   let mayLoad = 1 in
-  def rm : SS42AI<0x60, MRMSrcMem, (outs),
+  def rmi : SS42AI<0x60, MRMSrcMem, (outs),
     (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
     !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
     []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
@@ -6597,12 +6597,12 @@ let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
 }
 
 multiclass SS42AI_pcmpistri<string asm> {
-  def rr : SS42AI<0x63, MRMSrcReg, (outs),
+  def rri : SS42AI<0x63, MRMSrcReg, (outs),
     (ins VR128:$src1, VR128:$src2, u8imm:$src3),
     !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
     []>, Sched<[WritePCmpIStrI]>;
   let mayLoad = 1 in
-  def rm : SS42AI<0x63, MRMSrcMem, (outs),
+  def rmi : SS42AI<0x63, MRMSrcMem, (outs),
     (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
     !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
     []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
@@ -6615,12 +6615,12 @@ let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
 }
 
 multiclass SS42AI_pcmpestri<string asm> {
-  def rr : SS42AI<0x61, MRMSrcReg, (outs),
+  def rri : SS42AI<0x61, MRMSrcReg, (outs),
     (ins VR128:$src1, VR128:$src3, u8imm:$src5),
     !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
     []>, Sched<[WritePCmpEStrI]>;
   let mayLoad = 1 in
-  def rm : SS42AI<0x61, MRMSrcMem, (outs),
+  def rmi : SS42AI<0x61, MRMSrcMem, (outs),
     (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
     !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
     []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
@@ -6917,14 +6917,14 @@ def PCLMULCommuteImm : SDNodeXForm<timm, [{
 let Predicates = [NoAVX, HasPCLMUL] in {
   let Constraints = "$src1 = $dst" in {
     let isCommutable = 1 in
-    def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+    def PCLMULQDQrri : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, u8imm:$src3),
               "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
               [(set VR128:$dst,
                 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
                 Sched<[WriteCLMul]>;
 
-    def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+    def PCLMULQDQrmi : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
               (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
               "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
               [(set VR128:$dst,
@@ -6935,7 +6935,7 @@ let Predicates = [NoAVX, HasPCLMUL] in {
 
   def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
                                 (i8 timm:$src3)),
-            (PCLMULQDQrm VR128:$src1, addr:$src2,
+            (PCLMULQDQrmi VR128:$src1, addr:$src2,
                           (PCLMULCommuteImm timm:$src3))>;
 } // Predicates = [NoAVX, HasPCLMUL]
 
@@ -6943,10 +6943,10 @@ let Predicates = [NoAVX, HasPCLMUL] in {
 foreach HI = ["hq","lq"] in
 foreach LO = ["hq","lq"] in {
   def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
-                  (PCLMULQDQrr VR128:$dst, VR128:$src,
+                  (PCLMULQDQrri VR128:$dst, VR128:$src,
                    !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
   def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
-                  (PCLMULQDQrm VR128:$dst, i128mem:$src,
+                  (PCLMULQDQrmi VR128:$dst, i128mem:$src,
                    !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
 }
 
@@ -6954,25 +6954,25 @@ foreach LO = ["hq","lq"] in {
 multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
                       PatFrag LdFrag, Intrinsic IntId> {
   let isCommutable = 1 in
-  def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
-            (ins RC:$src1, RC:$src2, u8imm:$src3),
-            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-            [(set RC:$dst,
-              (IntId RC:$src1, RC:$src2, timm:$src3))]>,
-            Sched<[WriteCLMul]>;
-
-  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
-            (ins RC:$src1, MemOp:$src2, u8imm:$src3),
-            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-            [(set RC:$dst,
-               (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
-            Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
+  def rri : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, RC:$src2, u8imm:$src3),
+             "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+             [(set RC:$dst,
+               (IntId RC:$src1, RC:$src2, timm:$src3))]>,
+             Sched<[WriteCLMul]>;
+
+  def rmi : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
+             (ins RC:$src1, MemOp:$src2, u8imm:$src3),
+             "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+             [(set RC:$dst,
+                (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
+             Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
 
   // We can commute a load in the first operand by swapping the sources and
   // rotating the immediate.
   def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
-            (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
-                                           (PCLMULCommuteImm timm:$src3))>;
+            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
+                                            (PCLMULCommuteImm timm:$src3))>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
@@ -6986,10 +6986,10 @@ defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
                                    X86MemOperand MemOp, string Hi, string Lo> {
   def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                  (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
+                  (!cast<Instruction>(InstStr # "rri") RC:$dst, RC:$src1, RC:$src2,
                         !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
   def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                  (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
+                  (!cast<Instruction>(InstStr # "rmi") RC:$dst, RC:$src1, MemOp:$src2,
                         !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
 }
 
diff --git a/llvm/lib/Target/X86/X86SchedAlderlakeP.td b/llvm/lib/Target/X86/X86SchedAlderlakeP.td
index 8e3e5542826480..4dc5ea3c861125 100644
--- a/llvm/lib/Target/X86/X86SchedAlderlakeP.td
+++ b/llvm/lib/Target/X86/X86SchedAlderlakeP.td
@@ -2295,7 +2295,7 @@ def ADLPWriteResGroup263 : SchedWriteRes<[ADLPPort02_03_11, ADLPPort05]> {
 }
 def : InstRW<[ADLPWriteResGroup263, ReadAfterVecYLd], (instregex "^VPACK(S|U)S(DW|WB)Yrm$")>;
 def : InstRW<[ADLPWriteResGroup263, ReadAfterVecYLd], (instrs VPCMPGTQYrm)>;
-def : InstRW<[ADLPWriteResGroup263, ReadAfterVecXLd], (instrs VPCLMULQDQYrm)>;
+def : InstRW<[ADLPWriteResGroup263, ReadAfterVecXLd], (instrs VPCLMULQDQYrmi)>;
 
 def ADLPWriteResGroup264 : SchedWriteRes<[ADLPPort01_05, ADLPPort02_03_11]> {
   let Latency = 9;
diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
index 78c5994ee96470..3c698d2c9f7a01 100644
--- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td
+++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
@@ -2665,8 +2665,8 @@ def : InstRW<[SPRWriteResGroup258, ReadAfterVecYLd], (instregex "^VALIGN(D|Q)Z((
                                                                 "^VPUNPCK(H|L)(BW|WD)Zrmk(z?)$")>;
 def : InstRW<[SPRWriteResGroup258, ReadAfterVecYLd], (instrs VPCMPGTQYrm)>;
 def : InstRW<[SPRWriteResGroup258, ReadAfterVecXLd], (instregex "^VPALIGNRZ128rmik(z?)$",
-                                                                "^VPCLMULQDQ(Y|Z)rm$")>;
-def : InstRW<[SPRWriteResGroup258, ReadAfterVecXLd], (instrs VPCLMULQDQZ256rm)>;
+                                                                "^VPCLMULQDQ(Y|Z)rmi$")>;
+def : InstRW<[SPRWriteResGroup258, ReadAfterVecXLd], (instrs VPCLMULQDQZ256rmi)>;
 
 def SPRWriteResGroup259 : SchedWriteRes<[SPRPort00_01_05, SPRPort02_03_11]> {
   let ReleaseAtCycles = [3, 1];
diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
index c9749979576f27..296504cfc78513 100644
--- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -1275,12 +1275,12 @@ def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm,
 
 defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [1, 7], 5, 1>;
 
-def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> {
+def PdWriteVPCLMULQDQrri : SchedWriteRes<[PdFPU0, PdFPMMA]> {
   let Latency = 12;
   let ReleaseAtCycles = [1, 7];
   let NumMicroOps = 6;
 }
-def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;
+def : InstRW<[PdWriteVPCLMULQDQrri], (instrs VPCLMULQDQrri)>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // SSE4A instructions.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d2756b0d4d54fb..f5f3716d390d77 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2164,8 +2164,22 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
+    // Fold: usub_sat((sub nuw C, A), C1) -> usub_sat(usub_sat(C, C1), A)
+    // After the inner usub_sat(C, C1) is folded, this is equivalent to:
+    // usub_sat((sub nuw C, A), C1) -> usub_sat(C - C1, A) if C1 u< C
+    // usub_sat((sub nuw C, A), C1) -> 0 otherwise
+    Constant *C, *C1;
+    Value *A;
+    if (IID == Intrinsic::usub_sat &&
+        match(Arg0, m_NUWSub(m_ImmConstant(C), m_Value(A))) &&
+        match(Arg1, m_ImmConstant(C1))) {
+      auto *NewC = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, C, C1);
+      auto *NewSub =
+          Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, NewC, A);
+      return replaceInstUsesWith(*SI, NewSub);
+    }
+
     // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN
-    Constant *C;
     if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) &&
         C->isNotMinSignedValue()) {
       Value *NegVal = ConstantExpr::getNeg(C);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 40ebec7305b467..d75e322a74cfa6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -198,24 +198,21 @@ void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
   assert(!Parent && "Recipe already in some VPBasicBlock");
   assert(InsertPos->getParent() &&
          "Insertion position not in any VPBasicBlock");
-  Parent = InsertPos->getParent();
-  Parent->getRecipeList().insert(InsertPos->getIterator(), this);
+  InsertPos->getParent()->insert(this, InsertPos->getIterator());
 }
 
 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                 iplist<VPRecipeBase>::iterator I) {
   assert(!Parent && "Recipe already in some VPBasicBlock");
   assert(I == BB.end() || I->getParent() == &BB);
-  Parent = &BB;
-  BB.getRecipeList().insert(I, this);
+  BB.insert(this, I);
 }
 
 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
   assert(!Parent && "Recipe already in some VPBasicBlock");
   assert(InsertPos->getParent() &&
          "Insertion position not in any VPBasicBlock");
-  Parent = InsertPos->getParent();
-  Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
+  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
 }
 
 void VPRecipeBase::removeFromParent() {
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index 9b5e758b6ede57..54ec8bd28d4ca6 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -358,6 +358,14 @@ function(runtime_register_target name)
         endif()
       endif()
     endforeach()
+    foreach(variable_name ${${name}_extra_args})
+      string(FIND "${variable_name}" "-DRUNTIMES_${extra_name}_" out)
+      if("${out}" EQUAL 0) # found at index 0, i.e. the argument starts with the prefix
+        string(REPLACE "-DRUNTIMES_${extra_name}_" "" new_name ${variable_name})
+        string(REPLACE ";" "|" new_value "${new_name}") # presumably keeps list values intact when forwarded; confirm
+        list(APPEND ${name}_extra_args "-D${new_value}") # re-add the setting without the RUNTIMES_${extra_name}_ prefix
+      endif()
+    endforeach()
   endforeach()
 
   set_enable_per_target_runtime_dir()
@@ -438,21 +446,37 @@ if(runtimes)
     if(NOT hdrgen_exe)
       message(FATAL_ERROR "libc-hdrgen executable missing")
     endif()
-    set(libc_cmake_args "-DLIBC_HDRGEN_EXE=${hdrgen_exe}"
-                        "-DLLVM_LIBC_FULL_BUILD=ON")
+    list(APPEND libc_cmake_args "-DLIBC_HDRGEN_EXE=${hdrgen_exe}")
     list(APPEND extra_deps ${hdrgen_deps})
-    if(LLVM_LIBC_GPU_BUILD)
-      list(APPEND libc_cmake_args "-DLLVM_LIBC_GPU_BUILD=ON")
-      # The `libc` project may require '-DCUDAToolkit_ROOT' in GPU mode.
-      if(CUDAToolkit_ROOT)
-        list(APPEND libc_cmake_args "-DCUDAToolkit_ROOT=${CUDAToolkit_ROOT}")
+  endif()
+  if(LLVM_LIBC_GPU_BUILD)
+    list(APPEND libc_cmake_args "-DLLVM_LIBC_GPU_BUILD=ON")
+    if("libc" IN_LIST RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES)
+      if(TARGET amdhsa-loader)
+        list(APPEND libc_cmake_args
+             "-DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_LOADER_EXECUTABLE=$<TARGET_FILE:amdhsa-loader>")
+        list(APPEND extra_deps amdhsa-loader amdgpu-arch)
       endif()
-      foreach(dep clang-offload-packager nvptx-arch amdgpu-arch)
-        if(TARGET ${dep})
-          list(APPEND extra_deps ${dep})
-        endif()
-      endforeach()
+      list(APPEND libc_cmake_args "-DRUNTIMES_amdgcn-amd-amdhsa_LLVM_LIBC_FULL_BUILD=ON")
     endif()
+    if("libc" IN_LIST RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES)
+      if(TARGET nvptx-loader)
+        list(APPEND libc_cmake_args
+             "-DRUNTIMES_nvptx64-nvidia-cuda_LIBC_GPU_LOADER_EXECUTABLE=$<TARGET_FILE:nvptx-loader>")
+        list(APPEND extra_deps nvptx-loader nvptx-arch)
+      endif()
+      list(APPEND libc_cmake_args "-DRUNTIMES_nvptx64-nvidia-cuda_LLVM_LIBC_FULL_BUILD=ON")
+    endif()
+    # The `libc` project may require '-DCUDAToolkit_ROOT' in GPU mode.
+    if(CUDAToolkit_ROOT)
+      list(APPEND libc_cmake_args "-DCUDAToolkit_ROOT=${CUDAToolkit_ROOT}")
+    endif()
+    if(TARGET clang-offload-packager)
+      list(APPEND extra_deps clang-offload-packager)
+    endif()
+  endif()
+  if(LLVM_LIBC_FULL_BUILD)
+    list(APPEND libc_cmake_args "-DLLVM_LIBC_FULL_BUILD=ON")
   endif()
   if(NOT LLVM_RUNTIME_TARGETS)
     runtime_default_target(
diff --git a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
new file mode 100644
index 00000000000000..061e5ece44a4e7
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll
@@ -0,0 +1,128 @@
+; RUN: opt < %s -mtriple=systemz-unknown -mcpu=z13 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
+
+define void @reduce(ptr %src, ptr %dst) {
+; CHECK-LABEL: 'reduce'
+; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %V2_64)
+; CHECK:  Cost Model: Found an estimated cost of 3 for instruction: %R4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %V4_64)
+; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64)
+; CHECK:  Cost Model: Found an estimated cost of 9 for instruction: %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64)
+; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32)
+; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
+; CHECK:  Cost Model: Found an estimated cost of 5 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
+; CHECK:  Cost Model: Found an estimated cost of 7 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
+; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
+; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
+; CHECK:  Cost Model: Found an estimated cost of 6 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
+; CHECK:  Cost Model: Found an estimated cost of 7 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
+; CHECK:  Cost Model: Found an estimated cost of 2 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
+; CHECK:  Cost Model: Found an estimated cost of 4 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
+; CHECK:  Cost Model: Found an estimated cost of 6 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
+; CHECK:  Cost Model: Found an estimated cost of 8 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
+;
+; CHECK:  Cost Model: Found an estimated cost of 15 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
+; CHECK:  Cost Model: Found an estimated cost of 20 for instruction: %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256)
+
+  ; REDUCEADD64
+
+  %V2_64 = load <2 x i64>, ptr %src, align 8
+  %R2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %V2_64)
+  store volatile i64 %R2_64, ptr %dst, align 4
+
+  %V4_64 = load <4 x i64>, ptr %src, align 8
+  %R4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %V4_64)
+  store volatile i64 %R4_64, ptr %dst, align 4
+
+  %V8_64 = load <8 x i64>, ptr %src, align 8
+  %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64)
+  store volatile i64 %R8_64, ptr %dst, align 4
+
+  %V16_64 = load <16 x i64>, ptr %src, align 8
+  %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64)
+  store volatile i64 %R16_64, ptr %dst, align 4
+
+  ; REDUCEADD32
+
+  %V2_32 = load <2 x i32>, ptr %src, align 8
+  %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32)
+  store volatile i32 %R2_32, ptr %dst, align 4
+
+  %V4_32 = load <4 x i32>, ptr %src, align 8
+  %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32)
+  store volatile i32 %R4_32, ptr %dst, align 4
+
+  %V8_32 = load <8 x i32>, ptr %src, align 8
+  %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32)
+  store volatile i32 %R8_32, ptr %dst, align 4
+
+  %V16_32 = load <16 x i32>, ptr %src, align 8
+  %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32)
+  store volatile i32 %R16_32, ptr %dst, align 4
+
+  ; REDUCEADD16
+
+  %V2_16 = load <2 x i16>, ptr %src, align 8
+  %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16)
+  store volatile i16 %R2_16, ptr %dst, align 4
+
+  %V4_16 = load <4 x i16>, ptr %src, align 8
+  %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16)
+  store volatile i16 %R4_16, ptr %dst, align 4
+
+  %V8_16 = load <8 x i16>, ptr %src, align 8
+  %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16)
+  store volatile i16 %R8_16, ptr %dst, align 4
+
+  %V16_16 = load <16 x i16>, ptr %src, align 8
+  %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16)
+  store volatile i16 %R16_16, ptr %dst, align 4
+
+  ; REDUCEADD8
+
+  %V2_8 = load <2 x i8>, ptr %src, align 8
+  %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8)
+  store volatile i8 %R2_8, ptr %dst, align 4
+
+  %V4_8 = load <4 x i8>, ptr %src, align 8
+  %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8)
+  store volatile i8 %R4_8, ptr %dst, align 4
+
+  %V8_8 = load <8 x i8>, ptr %src, align 8
+  %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8)
+  store volatile i8 %R8_8, ptr %dst, align 4
+
+  %V16_8 = load <16 x i8>, ptr %src, align 8
+  %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8)
+  store volatile i8 %R16_8, ptr %dst, align 4
+
+  ; EXTREME VALUES
+
+  %V128_8 = load <128 x i8>, ptr %src, align 8
+  %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8)
+  store volatile i8 %R128_8, ptr %dst, align 4
+
+  %V4_256 = load <4 x i256>, ptr %src, align 8
+  %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256)
+  store volatile i256 %R4_256, ptr %dst, align 8
+
+  ret void
+}
+
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+
+declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
+declare i256 @llvm.vector.reduce.add.v4i256(<4 x i256>)
diff --git a/llvm/test/CodeGen/AArch64/and-mask-removal.ll b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
index a8a59f1591268f..17ff0159701689 100644
--- a/llvm/test/CodeGen/AArch64/and-mask-removal.ll
+++ b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
@@ -65,8 +65,9 @@ if.end:                                           ; preds = %if.then, %entry
 define zeroext i1 @test8_0(i8 zeroext %x)  align 2 {
 ; CHECK-LABEL: test8_0:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    sub w8, w0, #182
-; CHECK-NEXT:    cmn w8, #20
+; CHECK-NEXT:    add w8, w0, #74
+; CHECK-NEXT:    and w8, w8, #0xff
+; CHECK-NEXT:    cmp w8, #236
 ; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
 entry:
@@ -507,17 +508,16 @@ define i64 @pr58109(i8 signext %0) {
 define i64 @pr58109b(i8 signext %0, i64 %a, i64 %b) {
 ; CHECK-SD-LABEL: pr58109b:
 ; CHECK-SD:       ; %bb.0:
-; CHECK-SD-NEXT:    and w8, w0, #0xff
-; CHECK-SD-NEXT:    sub w8, w8, #255
-; CHECK-SD-NEXT:    cmn w8, #254
-; CHECK-SD-NEXT:    csel x0, x1, x2, lo
+; CHECK-SD-NEXT:    add w8, w0, #1
+; CHECK-SD-NEXT:    tst w8, #0xfe
+; CHECK-SD-NEXT:    csel x0, x1, x2, eq
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: pr58109b:
 ; CHECK-GI:       ; %bb.0:
-; CHECK-GI-NEXT:    mov w8, #-255 ; =0xffffff01
-; CHECK-GI-NEXT:    add w8, w8, w0, uxtb
-; CHECK-GI-NEXT:    cmn w8, #254
+; CHECK-GI-NEXT:    add w8, w0, #1
+; CHECK-GI-NEXT:    and w8, w8, #0xff
+; CHECK-GI-NEXT:    cmp w8, #2
 ; CHECK-GI-NEXT:    csel x0, x1, x2, lo
 ; CHECK-GI-NEXT:    ret
   %2 = add i8 %0, 1
diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
index bb4df6d8935b1b..ab42e6463feeed 100644
--- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -396,7 +396,7 @@ define i1 @add_ultcmp_bad_i24_i8(i24 %x) nounwind {
 define i1 @add_ulecmp_bad_i16_i8(i16 %x) nounwind {
 ; CHECK-LABEL: add_ulecmp_bad_i16_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    mov w0, #1
 ; CHECK-NEXT:    ret
   %tmp0 = add i16 %x, 128 ; 1U << (8-1)
   %tmp1 = icmp ule i16 %tmp0, -1 ; when we +1 it, it will wrap to 0
diff --git a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
index 39edc03ced442e..ccfbf456693d7a 100644
--- a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
+++ b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
@@ -246,8 +246,9 @@ define i32 @safe_sub_var_imm(ptr nocapture readonly %b) local_unnamed_addr #1 {
 ; CHECK-LABEL: safe_sub_var_imm:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    sub w8, w8, #248
-; CHECK-NEXT:    cmn w8, #4
+; CHECK-NEXT:    add w8, w8, #8
+; CHECK-NEXT:    and w8, w8, #0xff
+; CHECK-NEXT:    cmp w8, #252
 ; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index cf9c5a2e8f51d0..be4d6a2c278957 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -early-live-intervals < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
 
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 39f9cf7cf8fffc..422e2747094ce2 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -1263,7 +1263,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10-NEXT:  ; %bb.1: ; %else
 ; GFX10-NEXT:    s_add_u32 s4, s4, s6
 ; GFX10-NEXT:    s_addc_u32 s5, s5, s7
-; GFX10-NEXT:    s_mov_b32 s6, 0
 ; GFX10-NEXT:    s_cbranch_execnz .LBB9_3
 ; GFX10-NEXT:  .LBB9_2: ; %if
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
@@ -1275,7 +1274,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ; GFX10-NEXT:  .LBB9_4:
-; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX10-NEXT:    s_branch .LBB9_2
 ;
@@ -1288,7 +1286,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:  ; %bb.1: ; %else
 ; GFX11-NEXT:    s_add_u32 s4, s4, s6
 ; GFX11-NEXT:    s_addc_u32 s5, s5, s7
-; GFX11-NEXT:    s_mov_b32 s6, 0
 ; GFX11-NEXT:    s_cbranch_execnz .LBB9_3
 ; GFX11-NEXT:  .LBB9_2: ; %if
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
@@ -1301,7 +1298,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ; GFX11-NEXT:  .LBB9_4:
-; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX11-NEXT:    s_branch .LBB9_2
 ;
@@ -1313,7 +1309,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_cbranch_scc0 .LBB9_4
 ; GFX12-NEXT:  ; %bb.1: ; %else
 ; GFX12-NEXT:    s_add_nc_u64 s[4:5], s[4:5], s[6:7]
-; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    s_cbranch_execnz .LBB9_3
 ; GFX12-NEXT:  .LBB9_2: ; %if
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
@@ -1326,7 +1321,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ; GFX12-NEXT:  .LBB9_4:
-; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX12-NEXT:    s_branch .LBB9_2
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 502e6f390433cf..b6359f18169799 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1499,7 +1499,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
 ; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
-; SI-NEXT:    s_mov_b64 s[2:3], 0
 ; SI-NEXT:    s_cbranch_execnz .LBB14_3
 ; SI-NEXT:  .LBB14_2: ; %if
 ; SI-NEXT:    s_and_b32 s2, s4, 0xffff
@@ -1513,7 +1512,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ; SI-NEXT:  .LBB14_4:
-; SI-NEXT:    s_mov_b64 s[2:3], -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_branch .LBB14_2
 ;
@@ -1531,7 +1529,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
-; VI-NEXT:    s_mov_b64 s[2:3], 0
 ; VI-NEXT:    s_cbranch_execnz .LBB14_3
 ; VI-NEXT:  .LBB14_2: ; %if
 ; VI-NEXT:    s_and_b32 s2, s4, 0xffff
@@ -1545,7 +1542,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ; VI-NEXT:  .LBB14_4:
-; VI-NEXT:    s_mov_b64 s[2:3], -1
 ; VI-NEXT:    ; implicit-def: $vgpr0
 ; VI-NEXT:    s_branch .LBB14_2
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 3b9c3e3ba17523..131ce14a7847c8 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -358,7 +358,6 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ; SI-NEXT:  .LBB7_4:
-; SI-NEXT:    s_mov_b64 s[6:7], -1
 ; SI-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; SI-NEXT:    s_branch .LBB7_2
 ;
@@ -372,7 +371,6 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    s_cbranch_scc0 .LBB7_4
 ; VI-NEXT:  ; %bb.1: ; %else
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
-; VI-NEXT:    s_mov_b64 s[6:7], 0
 ; VI-NEXT:    s_cbranch_execnz .LBB7_3
 ; VI-NEXT:  .LBB7_2: ; %if
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -387,7 +385,6 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ; VI-NEXT:  .LBB7_4:
-; VI-NEXT:    s_mov_b64 s[6:7], -1
 ; VI-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; VI-NEXT:    s_branch .LBB7_2
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
index 3a8f06ba59a129..01af3346523827 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -5,6 +5,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode -early-live-intervals < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
 
 ; GCN-LABEL: test_local_misaligned_v2:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
similarity index 81%
rename from llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll
rename to llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
index 659cdb55ded2fe..b512a43aa10222 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
@@ -2,8 +2,9 @@
 ; RUN: not --crash opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
 
 @var1 = addrspace(3) global i32 undef, !absolute_symbol !0
+@var2 = addrspace(3) global i32 undef
 
-; CHECK: LLVM ERROR: LDS variables with absolute addresses are unimplemented.
+; CHECK: Module cannot mix absolute and non-absolute LDS GVs
 define amdgpu_kernel void @kern() {
   %val0 = load i32, ptr addrspace(3) @var1
   %val1 = add i32 %val0, 4
@@ -12,4 +13,3 @@ define amdgpu_kernel void @kern() {
 }
 
 !0 = !{i32 0, i32 1}
-
diff --git a/llvm/test/CodeGen/AMDGPU/lds-run-twice-absolute-md.ll b/llvm/test/CodeGen/AMDGPU/lds-run-twice-absolute-md.ll
new file mode 100644
index 00000000000000..52b44eea35c827
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-run-twice-absolute-md.ll
@@ -0,0 +1,16 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds %s -o %t.ll
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds %t.ll -o %t.second.ll
+; RUN: diff -ub %t.ll %t.second.ll -I ".*ModuleID.*"
+
+; Check AMDGPULowerModuleLDS can run more than once on the same module, and that
+; the second run is a no-op.
+
+@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4, !absolute_symbol !0
+
+define amdgpu_kernel void @test() {
+entry:
+  store i32 1, ptr addrspace(3) @lds
+  ret void
+}
+
+!0 = !{i32 0, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-run-twice.ll b/llvm/test/CodeGen/AMDGPU/lds-run-twice.ll
new file mode 100644
index 00000000000000..b830ccb944a282
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-run-twice.ll
@@ -0,0 +1,14 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds %s -o %t.ll
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds %t.ll -o %t.second.ll
+; RUN: diff -ub %t.ll %t.second.ll -I ".*ModuleID.*"
+
+; Check AMDGPULowerModuleLDS can run more than once on the same module, and that
+; the second run is a no-op.
+
+@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
+
+define amdgpu_kernel void @test() {
+entry:
+  store i32 1, ptr addrspace(3) @lds
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index f30c890934c92b..8302af7450ed9d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -100,7 +100,6 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
-; GCN-NEXT:    s_mov_b64 s[2:3], 0
 ; GCN-NEXT:    s_cbranch_execnz .LBB4_2
 ; GCN-NEXT:  .LBB4_4: ; %.zero
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index 8ae6e13303446b..8dbbab3c57f72f 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY908 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A-GISEL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,FAST %s
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 0d2558c4f0124f..b4272049f36a4c 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2517,7 +2517,6 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10-NEXT:    s_add_i32 s7, s8, s7
 ; GFX10-NEXT:    s_mul_i32 s4, s4, s6
 ; GFX10-NEXT:    s_add_i32 s5, s7, s5
-; GFX10-NEXT:    s_mov_b32 s6, 0
 ; GFX10-NEXT:    s_cbranch_execnz .LBB16_4
 ; GFX10-NEXT:  .LBB16_2: ; %if
 ; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
@@ -2527,7 +2526,6 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX10-NEXT:    s_branch .LBB16_5
 ; GFX10-NEXT:  .LBB16_3:
-; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX10-NEXT:    s_branch .LBB16_2
 ; GFX10-NEXT:  .LBB16_4:
@@ -2553,7 +2551,6 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_add_i32 s7, s8, s7
 ; GFX11-NEXT:    s_mul_i32 s4, s4, s6
 ; GFX11-NEXT:    s_add_i32 s5, s7, s5
-; GFX11-NEXT:    s_mov_b32 s6, 0
 ; GFX11-NEXT:    s_cbranch_execnz .LBB16_4
 ; GFX11-NEXT:  .LBB16_2: ; %if
 ; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
@@ -2563,7 +2560,6 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], 0
 ; GFX11-NEXT:    s_branch .LBB16_5
 ; GFX11-NEXT:  .LBB16_3:
-; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX11-NEXT:    s_branch .LBB16_2
 ; GFX11-NEXT:  .LBB16_4:
@@ -2585,7 +2581,6 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_cbranch_scc0 .LBB16_3
 ; GFX12-NEXT:  ; %bb.1: ; %else
 ; GFX12-NEXT:    s_mul_u64 s[4:5], s[4:5], s[6:7]
-; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    s_cbranch_execnz .LBB16_4
 ; GFX12-NEXT:  .LBB16_2: ; %if
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
@@ -2595,7 +2590,6 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], null
 ; GFX12-NEXT:    s_branch .LBB16_5
 ; GFX12-NEXT:  .LBB16_3:
-; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX12-NEXT:    s_branch .LBB16_2
 ; GFX12-NEXT:  .LBB16_4:
diff --git a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir
index 2bc4288884f192..3d49fee8fdaf43 100644
--- a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir
+++ b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir
@@ -89,7 +89,7 @@ body:             |
 # CHECK: $sp = t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, $r4, $r5, $r6, undef $r7, $r8, $r9, $r10, $r11
 # CHECK-NEXT:  $r0 = t2BICri $r0, 1, 14 /* CC::al */, $noreg, $noreg
 # CHECK-NEXT:  $sp = tSUBspi $sp, 34, 14 /* CC::al */, $noreg
-# CHECK-NEXT:  VLSTM $sp, 14 /* CC::al */, $noreg, implicit undef $vpr, implicit undef $fpscr, implicit undef $fpscr_nzcv, implicit undef $q0, implicit undef $q1, implicit undef $q2, implicit undef $q3, implicit undef $q4, implicit undef $q5, implicit undef $q6, implicit undef $q7
+# CHECK-NEXT:  VLSTM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit undef $vpr, implicit undef $fpscr, implicit undef $fpscr_nzcv, implicit undef $d0, implicit undef $d1, implicit undef $d2, implicit undef $d3, implicit undef $d4, implicit undef $d5, implicit undef $d6, implicit undef $d7, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, implicit $d14, implicit $d15
 # CHECK-NEXT:  $r1 = tMOVr $r0, 14 /* CC::al */, $noreg
 # CHECK-NEXT:  $r2 = tMOVr $r0, 14 /* CC::al */, $noreg
 # CHECK-NEXT:  $r3 = tMOVr $r0, 14 /* CC::al */, $noreg
@@ -105,8 +105,7 @@ body:             |
 # CHECK-NEXT:  t2MSR_M 3072, $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr
 # CHECK-NEXT:  tBLXNSr 14 /* CC::al */, $noreg, killed $r0, csr_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $s0
 # CHECK-NEXT:  $r12 = VMOVRS $s0, 14 /* CC::al */, $noreg
-# CHECK-NEXT:  VLLDM $sp, 14 /* CC::al */, $noreg, implicit-def $q0, implicit-def $q1, implicit-def $q2, implicit-def $q3, implicit-def $q4, implicit-def $q5, implicit-def $q6, implicit-def $q7, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv
+# CHECK-NEXT:  VLLDM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit-def $d0, implicit-def $d1, implicit-def $d2, implicit-def $d3, implicit-def $d4, implicit-def $d5, implicit-def $d6, implicit-def $d7, implicit-def $d8, implicit-def $d9, implicit-def $d10, implicit-def $d11, implicit-def $d12, implicit-def $d13, implicit-def $d14, implicit-def $d15
 # CHECK-NEXT:  $s0 = VMOVSR $r12, 14 /* CC::al */, $noreg
 # CHECK-NEXT:  $sp = tADDspi $sp, 34, 14 /* CC::al */, $noreg
 # CHECK-NEXT:  $sp = t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11
- 
diff --git a/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir b/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir
index 8c49a531674115..8fa9337eae6cd9 100644
--- a/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir
+++ b/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir
@@ -60,9 +60,9 @@ body:             |
     $sp = t2STMDB_UPD $sp, 14, $noreg, $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11
     $r4 = t2BICri $r4, 1, 14, $noreg, $noreg
     $sp = tSUBspi $sp, 34, 14, $noreg
-    VLSTM $sp, 14, $noreg
-    tBLXNSr 14, $noreg, killed $r4, csr_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp
-    VLLDM $sp, 14, $noreg, implicit-def $q0, implicit-def $q1, implicit-def $q2, implicit-def $q3, implicit-def $q4, implicit-def $q5, implicit-def $q6, implicit-def $q7, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv
+    VLSTM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit undef $vpr, implicit undef $fpscr, implicit undef $fpscr_nzcv, implicit undef $d0, implicit undef $d1, implicit undef $d2, implicit undef $d3, implicit undef $d4, implicit undef $d5, implicit undef $d6, implicit undef $d7, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, implicit $d14, implicit $d15
+    tBLXNSr 14, $noreg, killed $r4, csr_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $q0, implicit-def $q1, implicit-def $q2, implicit-def $q3, implicit-def $q4, implicit-def $q5, implicit-def $q6, implicit-def $q7
+    VLLDM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit-def $d0, implicit-def $d1, implicit-def $d2, implicit-def $d3, implicit-def $d4, implicit-def $d5, implicit-def $d6, implicit-def $d7, implicit-def $d8, implicit-def $d9, implicit-def $d10, implicit-def $d11, implicit-def $d12, implicit-def $d13, implicit-def $d14, implicit-def $d15
     $sp = tADDspi $sp, 34, 14, $noreg
     $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11
     $sp = t2LDMIA_RET $sp, 14, $noreg, def $r4, def $pc
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 17d64c86dd53a4..c38406bafa8a97 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -915,3 +915,42 @@ bb4:                                              ; preds = %bb4, %bb2
 bb16:                                             ; preds = %bb4, %bb
   ret void
 }
+
+define void @gather_zero_stride_fp(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
+; CHECK-LABEL: gather_zero_stride_fp:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
+; CHECK-NEXT:  .LBB15_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    flw fa5, 0(a1)
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfadd.vf v8, v8, fa5
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    addi a1, a1, 640
+; CHECK-NEXT:    bne a0, a2, .LBB15_1
+; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
+  %i = mul nuw nsw <8 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+  %i1 = getelementptr inbounds float, ptr %B, <8 x i64> %i
+  %wide.masked.gather = call <8 x float> @llvm.masked.gather.v8f32.v32p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
+  %i2 = getelementptr inbounds float, ptr %A, i64 %index
+  %wide.load = load <8 x float>, ptr %i2, align 4
+  %i4 = fadd <8 x float> %wide.load, %wide.masked.gather
+  store <8 x float> %i4, ptr %i2, align 4
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <8 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %i6 = icmp eq i64 %index.next, 1024
+  br i1 %i6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/srodata.ll b/llvm/test/CodeGen/RISCV/srodata.ll
new file mode 100644
index 00000000000000..1d5bd904f233fe
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/srodata.ll
@@ -0,0 +1,47 @@
+; RUN: sed 's/SMALL_DATA_LIMIT/0/g' %s | \
+; RUN:   llc -mtriple=riscv32 -mattr=+d | \
+; RUN:   FileCheck -check-prefix=CHECK-SDL-0 %s
+; RUN: sed 's/SMALL_DATA_LIMIT/0/g' %s | \
+; RUN:   llc -mtriple=riscv64 -mattr=+d | \
+; RUN:   FileCheck -check-prefix=CHECK-SDL-0 %s
+; RUN: sed 's/SMALL_DATA_LIMIT/4/g' %s | \
+; RUN:   llc -mtriple=riscv32 -mattr=+d | \
+; RUN:   FileCheck -check-prefix=CHECK-SDL-4 %s
+; RUN: sed 's/SMALL_DATA_LIMIT/4/g' %s | \
+; RUN:   llc -mtriple=riscv64 -mattr=+d | \
+; RUN:   FileCheck -check-prefix=CHECK-SDL-4 %s
+; RUN: sed 's/SMALL_DATA_LIMIT/8/g' %s | \
+; RUN:   llc -mtriple=riscv32 -mattr=+d | \
+; RUN:   FileCheck -check-prefix=CHECK-SDL-8 %s
+; RUN: sed 's/SMALL_DATA_LIMIT/8/g' %s | \
+; RUN:   llc -mtriple=riscv64 -mattr=+d | \
+; RUN:   FileCheck -check-prefix=CHECK-SDL-8 %s
+; RUN: sed 's/SMALL_DATA_LIMIT/16/g' %s | \
+; RUN:   llc -mtriple=riscv32 -mattr=+d | \
+; RUN:   FileCheck -check-prefix=CHECK-SDL-16 %s
+; RUN: sed 's/SMALL_DATA_LIMIT/16/g' %s | \
+; RUN:   llc -mtriple=riscv64 -mattr=+d | \
+; RUN:   FileCheck -check-prefix=CHECK-SDL-16 %s
+
+define dso_local float @foof() {
+entry:
+  ret float 0x400A08ACA0000000
+}
+
+define dso_local double @foo() {
+entry:
+  ret double 0x400A08AC91C3E242
+}
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 8, !"SmallDataLimit", i32 SMALL_DATA_LIMIT}
+
+; CHECK-SDL-0-NOT:    .section        .srodata.cst4
+; CHECK-SDL-0-NOT:    .section        .srodata.cst8
+; CHECK-SDL-4:        .section        .srodata.cst4
+; CHECK-SDL-4-NOT:    .section        .srodata.cst8
+; CHECK-SDL-8:        .section        .srodata.cst4
+; CHECK-SDL-8:        .section        .srodata.cst8
+; CHECK-SDL-16:       .section        .srodata.cst4
+; CHECK-SDL-16:       .section        .srodata.cst8
diff --git a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
index ec7e0ecce80caa..3740dc675949fa 100644
--- a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
@@ -283,8 +283,9 @@ define i32 @safe_sub_var_imm(ptr nocapture readonly %b) local_unnamed_addr #1 {
 ; CHECK-LABEL: safe_sub_var_imm:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lbu a0, 0(a0)
-; CHECK-NEXT:    addi a0, a0, -248
-; CHECK-NEXT:    sltiu a0, a0, -3
+; CHECK-NEXT:    addi a0, a0, 8
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    sltiu a0, a0, 253
 ; CHECK-NEXT:    xori a0, a0, 1
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/MC/ARM/thumbv8m.s b/llvm/test/MC/ARM/thumbv8m.s
index 0e9ab4a9b3bf91..f03dd03dae3a4f 100644
--- a/llvm/test/MC/ARM/thumbv8m.s
+++ b/llvm/test/MC/ARM/thumbv8m.s
@@ -184,13 +184,13 @@ ttat r0, r1
 // 'Lazy Load/Store Multiple'
 
 // UNDEF-BASELINE: error: instruction requires: armv8m.main
-// CHECK-MAINLINE: vlldm r5          @ encoding: [0x35,0xec,0x00,0x0a]
-// CHECK-MAINLINE_DSP: vlldm r5      @ encoding: [0x35,0xec,0x00,0x0a]
+// CHECK-MAINLINE: vlldm r5, {d0 - d15} @ encoding: [0x35,0xec,0x00,0x0a]
+// CHECK-MAINLINE_DSP: vlldm r5, {d0 - d15} @ encoding: [0x35,0xec,0x00,0x0a]
 vlldm r5
 
 // UNDEF-BASELINE: error: instruction requires: armv8m.main
-// CHECK-MAINLINE: vlstm r10         @ encoding: [0x2a,0xec,0x00,0x0a]
-// CHECK-MAINLINE_DSP: vlstm r10     @ encoding: [0x2a,0xec,0x00,0x0a]
+// CHECK-MAINLINE: vlstm r10, {d0 - d15} @ encoding: [0x2a,0xec,0x00,0x0a]
+// CHECK-MAINLINE_DSP: vlstm r10, {d0 - d15} @ encoding: [0x2a,0xec,0x00,0x0a]
 vlstm r10
 
 // New SYSm's
diff --git a/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s b/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s
new file mode 100644
index 00000000000000..4e35883ffe4332
--- /dev/null
+++ b/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s
@@ -0,0 +1,11 @@
+// RUN: llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+// RUN: llvm-mc -triple=thumbv8.1m.main-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+vlstm r8, {d0 - d31}
+// CHECK: vlstm	r8, {d0 - d31} @ encoding: [0x28,0xec,0x80,0x0a]
+
+vlldm r8, {d0 - d31}
+// CHECK: vlldm	r8, {d0 - d31} @ encoding: [0x38,0xec,0x80,0x0a]
diff --git a/llvm/test/MC/ARM/vlstm-vlldm-8m.s b/llvm/test/MC/ARM/vlstm-vlldm-8m.s
new file mode 100644
index 00000000000000..bbc95318aeb3d0
--- /dev/null
+++ b/llvm/test/MC/ARM/vlstm-vlldm-8m.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -triple=armv8m.main-arm-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+// RUN: llvm-mc -triple=thumbv8m.main-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+vlstm r8, {d0 - d15}
+// CHECK: vlstm	r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+vlldm r8, {d0 - d15}
+// CHECK: vlldm	r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
+
+vlstm r8
+// CHECK: vlstm	r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+vlldm r8
+// CHECK: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
diff --git a/llvm/test/MC/ARM/vlstm-vlldm-diag.s b/llvm/test/MC/ARM/vlstm-vlldm-diag.s
new file mode 100644
index 00000000000000..b57f535c6a25cf
--- /dev/null
+++ b/llvm/test/MC/ARM/vlstm-vlldm-diag.s
@@ -0,0 +1,61 @@
+// RUN: not llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s 2>&1 >/dev/null \
+// RUN: | FileCheck --check-prefixes=ERR %s
+
+// RUN: not llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s 2>&1 >/dev/null \
+// RUN: | FileCheck --check-prefixes=ERRT2 %s
+
+vlstm r8, {d0 - d11}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d0 - d11}
+
+vlldm r8, {d0 - d11}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d0 - d11}
+
+vlstm r8, {d3 - d15}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d3 - d15}
+
+vlldm r8, {d3 - d15}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d3 - d15}
+
+vlstm r8, {d0 - d29}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d0 - d29}
+
+vlldm r8, {d0 - d29}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d0 - d29}
+
+vlstm r8, {d3 - d31}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d3 - d31}
+
+vlldm r8, {d3 - d31}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d3 - d31}
+
+vlstm r8, {d0 - d35}
+// ERR: error: register expected
+// ERR-NEXT: vlstm r8, {d0 - d35}
+
+vlldm r8, {d0 - d35}
+// ERR: error: register expected
+// ERR-NEXT: vlldm r8, {d0 - d35}
+
+vlstm pc
+// ERR: error: operand must be a register in range [r0, r14]
+// ERR-NEXT: vlstm pc
+
+vlldm pc
+// ERR: error: operand must be a register in range [r0, r14]
+// ERR-NEXT: vlldm pc
+
+vlstm pc
+// ERRT2: error: operand must be a register in range [r0, r14]
+// ERRT2-NEXT: vlstm pc
+
+vlldm pc
+// ERRT2: error: operand must be a register in range [r0, r14]
+// ERRT2-NEXT: vlldm pc
\ No newline at end of file
diff --git a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt
new file mode 100644
index 00000000000000..6b9882454c06a3
--- /dev/null
+++ b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt
@@ -0,0 +1,11 @@
+// RUN: llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+// RUN: llvm-mc -triple=thumbv8.1m.main-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+[0x28,0xec,0x80,0x0a]
+// CHECK-DISS: vlstm r8, {d0 - d31} @ encoding: [0x28,0xec,0x80,0x0a]
+
+[0x38,0xec,0x80,0x0a]
+// CHECK-DISS: vlldm r8, {d0 - d31} @ encoding: [0x38,0xec,0x80,0x0a]
\ No newline at end of file
diff --git a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt
new file mode 100644
index 00000000000000..1e28d5284c5b2a
--- /dev/null
+++ b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -triple=armv8m.main-arm-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+// RUN: llvm-mc -triple=thumbv8m.main-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+[0x28,0xec,0x00,0x0a]
+// CHECK-DISS: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+[0x38,0xec,0x00,0x0a]
+// CHECK-DISS: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
+
+[0x28,0xec,0x00,0x0a]
+// CHECK-DISS: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+[0x38,0xec,0x00,0x0a]
+// CHECK-DISS: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
\ No newline at end of file
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index d0ae2c474e85ac..e0fccd42e47f73 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -866,10 +866,10 @@ static const X86FoldTableEntry Table1[] = {
   {X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16},
   {X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16},
   {X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16},
-  {X86::PCMPESTRIrr, X86::PCMPESTRIrm, 0},
-  {X86::PCMPESTRMrr, X86::PCMPESTRMrm, 0},
-  {X86::PCMPISTRIrr, X86::PCMPISTRIrm, 0},
-  {X86::PCMPISTRMrr, X86::PCMPISTRMrm, 0},
+  {X86::PCMPESTRIrri, X86::PCMPESTRIrmi, 0},
+  {X86::PCMPESTRMrri, X86::PCMPESTRMrmi, 0},
+  {X86::PCMPISTRIrri, X86::PCMPISTRIrmi, 0},
+  {X86::PCMPISTRMrri, X86::PCMPISTRMrmi, 0},
   {X86::PF2IDrr, X86::PF2IDrm, 0},
   {X86::PF2IWrr, X86::PF2IWrm, 0},
   {X86::PFRCPrr, X86::PFRCPrm, 0},
@@ -1544,10 +1544,10 @@ static const X86FoldTableEntry Table1[] = {
   {X86::VPBROADCASTWZ256rr, X86::VPBROADCASTWZ256rm, TB_NO_REVERSE},
   {X86::VPBROADCASTWZrr, X86::VPBROADCASTWZrm, TB_NO_REVERSE},
   {X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE},
-  {X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0},
-  {X86::VPCMPESTRMrr, X86::VPCMPESTRMrm, 0},
-  {X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0},
-  {X86::VPCMPISTRMrr, X86::VPCMPISTRMrm, 0},
+  {X86::VPCMPESTRIrri, X86::VPCMPESTRIrmi, 0},
+  {X86::VPCMPESTRMrri, X86::VPCMPESTRMrmi, 0},
+  {X86::VPCMPISTRIrri, X86::VPCMPISTRIrmi, 0},
+  {X86::VPCMPISTRMrri, X86::VPCMPISTRMrmi, 0},
   {X86::VPCONFLICTDZ128rr, X86::VPCONFLICTDZ128rm, 0},
   {X86::VPCONFLICTDZ256rr, X86::VPCONFLICTDZ256rm, 0},
   {X86::VPCONFLICTDZrr, X86::VPCONFLICTDZrm, 0},
@@ -2129,7 +2129,7 @@ static const X86FoldTableEntry Table2[] = {
   {X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16},
   {X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16},
   {X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16},
-  {X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16},
+  {X86::PCLMULQDQrri, X86::PCLMULQDQrmi, TB_ALIGN_16},
   {X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16},
   {X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16},
   {X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16},
@@ -3058,11 +3058,11 @@ static const X86FoldTableEntry Table2[] = {
   {X86::VPBROADCASTWZ128rrkz, X86::VPBROADCASTWZ128rmkz, TB_NO_REVERSE},
   {X86::VPBROADCASTWZ256rrkz, X86::VPBROADCASTWZ256rmkz, TB_NO_REVERSE},
   {X86::VPBROADCASTWZrrkz, X86::VPBROADCASTWZrmkz, TB_NO_REVERSE},
-  {X86::VPCLMULQDQYrr, X86::VPCLMULQDQYrm, 0},
-  {X86::VPCLMULQDQZ128rr, X86::VPCLMULQDQZ128rm, 0},
-  {X86::VPCLMULQDQZ256rr, X86::VPCLMULQDQZ256rm, 0},
-  {X86::VPCLMULQDQZrr, X86::VPCLMULQDQZrm, 0},
-  {X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0},
+  {X86::VPCLMULQDQYrri, X86::VPCLMULQDQYrmi, 0},
+  {X86::VPCLMULQDQZ128rri, X86::VPCLMULQDQZ128rmi, 0},
+  {X86::VPCLMULQDQZ256rri, X86::VPCLMULQDQZ256rmi, 0},
+  {X86::VPCLMULQDQZrri, X86::VPCLMULQDQZrmi, 0},
+  {X86::VPCLMULQDQrri, X86::VPCLMULQDQrmi, 0},
   {X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0},
   {X86::VPCMOVrrr, X86::VPCMOVrmr, 0},
   {X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0},
diff --git a/llvm/test/Transforms/HardwareLoops/scalar-while-strictfp.ll b/llvm/test/Transforms/HardwareLoops/scalar-while-strictfp.ll
new file mode 100644
index 00000000000000..951aacc0653628
--- /dev/null
+++ b/llvm/test/Transforms/HardwareLoops/scalar-while-strictfp.ll
@@ -0,0 +1,428 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes
+; RUN: opt -passes='hardware-loops<force-hardware-loops;hardware-loop-decrement=1;hardware-loop-counter-bitwidth=32>' -S %s -o - | FileCheck %s --check-prefix=CHECK-DEC
+; RUN: opt -passes='hardware-loops<force-hardware-loops;hardware-loop-decrement=1;hardware-loop-counter-bitwidth=32;force-hardware-loop-phi>' -S %s -o - | FileCheck %s --check-prefix=CHECK-PHI
+
+define void @while_lt(i32 %i, i32 %N, ptr nocapture %A) strictfp {
+; CHECK-DEC: Function Attrs: strictfp
+; CHECK-DEC-LABEL: @while_lt(
+; CHECK-DEC-NEXT:  entry:
+; CHECK-DEC-NEXT:    [[CMP4:%.*]] = icmp ult i32 [[I:%.*]], [[N:%.*]]
+; CHECK-DEC-NEXT:    br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-DEC:       while.body.preheader:
+; CHECK-DEC-NEXT:    [[TMP0:%.*]] = sub i32 [[N]], [[I]]
+; CHECK-DEC-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP0]]) #[[ATTR0:[0-9]+]]
+; CHECK-DEC-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-DEC:       while.body:
+; CHECK-DEC-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-DEC-NEXT:    [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br i1 [[TMP1]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC:       while.end:
+; CHECK-DEC-NEXT:    ret void
+;
+; CHECK-PHI: Function Attrs: strictfp
+; CHECK-PHI-LABEL: @while_lt(
+; CHECK-PHI-NEXT:  entry:
+; CHECK-PHI-NEXT:    [[CMP4:%.*]] = icmp ult i32 [[I:%.*]], [[N:%.*]]
+; CHECK-PHI-NEXT:    br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-PHI:       while.body.preheader:
+; CHECK-PHI-NEXT:    [[TMP0:%.*]] = sub i32 [[N]], [[I]]
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]]) #[[ATTR0:[0-9]+]]
+; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-PHI:       while.body:
+; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT:    [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHI-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1) #[[ATTR0]]
+; CHECK-PHI-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI:       while.end:
+; CHECK-PHI-NEXT:    ret void
+;
+entry:
+  %cmp4 = icmp ult i32 %i, %N
+  br i1 %cmp4, label %while.body, label %while.end
+
+while.body:
+  %i.addr.05 = phi i32 [ %inc, %while.body ], [ %i, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.addr.05
+  store i32 %i.addr.05, ptr %arrayidx, align 4
+  %inc = add nuw i32 %i.addr.05, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+define void @while_gt(i32 %i, i32 %N, ptr nocapture %A) strictfp {
+; CHECK-DEC: Function Attrs: strictfp
+; CHECK-DEC-LABEL: @while_gt(
+; CHECK-DEC-NEXT:  entry:
+; CHECK-DEC-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-DEC-NEXT:    br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-DEC:       while.body.preheader:
+; CHECK-DEC-NEXT:    [[TMP0:%.*]] = sub i32 [[I]], [[N]]
+; CHECK-DEC-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP0]]) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-DEC:       while.body:
+; CHECK-DEC-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT:    [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-DEC-NEXT:    [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br i1 [[TMP1]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC:       while.end:
+; CHECK-DEC-NEXT:    ret void
+;
+; CHECK-PHI: Function Attrs: strictfp
+; CHECK-PHI-LABEL: @while_gt(
+; CHECK-PHI-NEXT:  entry:
+; CHECK-PHI-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-PHI-NEXT:    br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-PHI:       while.body.preheader:
+; CHECK-PHI-NEXT:    [[TMP0:%.*]] = sub i32 [[I]], [[N]]
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]]) #[[ATTR0]]
+; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-PHI:       while.body:
+; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT:    [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT:    [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-PHI-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1) #[[ATTR0]]
+; CHECK-PHI-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI:       while.end:
+; CHECK-PHI-NEXT:    ret void
+;
+entry:
+  %cmp4 = icmp sgt i32 %i, %N
+  br i1 %cmp4, label %while.body, label %while.end
+
+while.body:
+  %i.addr.05 = phi i32 [ %dec, %while.body ], [ %i, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.addr.05
+  store i32 %i.addr.05, ptr %arrayidx, align 4
+  %dec = add nsw i32 %i.addr.05, -1
+  %cmp = icmp sgt i32 %dec, %N
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+  ret void
+}
+
+define void @while_gte(i32 %i, i32 %N, ptr nocapture %A) strictfp {
+; CHECK-DEC: Function Attrs: strictfp
+; CHECK-DEC-LABEL: @while_gte(
+; CHECK-DEC-NEXT:  entry:
+; CHECK-DEC-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-DEC-NEXT:    br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-DEC:       while.body.preheader:
+; CHECK-DEC-NEXT:    [[TMP0:%.*]] = add i32 [[I]], 1
+; CHECK-DEC-NEXT:    [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]]
+; CHECK-DEC-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP1]]) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-DEC:       while.body:
+; CHECK-DEC-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT:    [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-DEC-NEXT:    [[TMP2:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC:       while.end:
+; CHECK-DEC-NEXT:    ret void
+;
+; CHECK-PHI: Function Attrs: strictfp
+; CHECK-PHI-LABEL: @while_gte(
+; CHECK-PHI-NEXT:  entry:
+; CHECK-PHI-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-PHI-NEXT:    br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-PHI:       while.body.preheader:
+; CHECK-PHI-NEXT:    [[TMP0:%.*]] = add i32 [[I]], 1
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]]
+; CHECK-PHI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP1]]) #[[ATTR0]]
+; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-PHI:       while.body:
+; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT:    [[TMP3:%.*]] = phi i32 [ [[TMP2]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP4:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT:    [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-PHI-NEXT:    [[TMP4]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) #[[ATTR0]]
+; CHECK-PHI-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP4]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP5]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI:       while.end:
+; CHECK-PHI-NEXT:    ret void
+;
+entry:
+  %cmp4 = icmp slt i32 %i, %N
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:
+  %i.addr.05 = phi i32 [ %dec, %while.body ], [ %i, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.addr.05
+  store i32 %i.addr.05, ptr %arrayidx, align 4
+  %dec = add nsw i32 %i.addr.05, -1
+  %cmp = icmp sgt i32 %i.addr.05, %N
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+  ret void
+}
+
+define void @while_ne(i32 %N, ptr nocapture %A) strictfp {
+; CHECK-DEC: Function Attrs: strictfp
+; CHECK-DEC-LABEL: @while_ne(
+; CHECK-DEC-NEXT:  entry:
+; CHECK-DEC-NEXT:    [[CMP:%.*]] = icmp ne i32 [[N:%.*]], 0
+; CHECK-DEC-NEXT:    br i1 [[CMP]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-DEC:       while.body.preheader:
+; CHECK-DEC-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[N]]) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-DEC:       while.body:
+; CHECK-DEC-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-DEC-NEXT:    [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br i1 [[TMP0]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC:       while.end:
+; CHECK-DEC-NEXT:    ret void
+;
+; CHECK-PHI: Function Attrs: strictfp
+; CHECK-PHI-LABEL: @while_ne(
+; CHECK-PHI-NEXT:  entry:
+; CHECK-PHI-NEXT:    [[CMP:%.*]] = icmp ne i32 [[N:%.*]], 0
+; CHECK-PHI-NEXT:    br i1 [[CMP]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-PHI:       while.body.preheader:
+; CHECK-PHI-NEXT:    [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]]) #[[ATTR0]]
+; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-PHI:       while.body:
+; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHI-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) #[[ATTR0]]
+; CHECK-PHI-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI:       while.end:
+; CHECK-PHI-NEXT:    ret void
+;
+entry:
+  %cmp = icmp ne i32 %N, 0
+  br i1 %cmp, label %while.body, label %while.end
+
+while.body:
+  %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.addr.05
+  store i32 %i.addr.05, ptr %arrayidx, align 4
+  %inc = add nuw i32 %i.addr.05, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+define void @while_eq(i32 %N, ptr nocapture %A) strictfp {
+; CHECK-DEC: Function Attrs: strictfp
+; CHECK-DEC-LABEL: @while_eq(
+; CHECK-DEC-NEXT:  entry:
+; CHECK-DEC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-DEC-NEXT:    br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-DEC:       while.body.preheader:
+; CHECK-DEC-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[N]]) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-DEC:       while.body:
+; CHECK-DEC-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-DEC-NEXT:    [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br i1 [[TMP0]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC:       while.end:
+; CHECK-DEC-NEXT:    ret void
+;
+; CHECK-PHI: Function Attrs: strictfp
+; CHECK-PHI-LABEL: @while_eq(
+; CHECK-PHI-NEXT:  entry:
+; CHECK-PHI-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-PHI-NEXT:    br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-PHI:       while.body.preheader:
+; CHECK-PHI-NEXT:    [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]]) #[[ATTR0]]
+; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-PHI:       while.body:
+; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHI-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) #[[ATTR0]]
+; CHECK-PHI-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI:       while.end:
+; CHECK-PHI-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %N, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:
+  %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.addr.05
+  store i32 %i.addr.05, ptr %arrayidx, align 4
+  %inc = add nuw i32 %i.addr.05, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+define void @while_preheader_eq(i32 %N, ptr nocapture %A) strictfp {
+; CHECK-DEC: Function Attrs: strictfp
+; CHECK-DEC-LABEL: @while_preheader_eq(
+; CHECK-DEC-NEXT:  entry:
+; CHECK-DEC-NEXT:    br label [[PREHEADER:%.*]]
+; CHECK-DEC:       preheader:
+; CHECK-DEC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-DEC-NEXT:    br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-DEC:       while.body.preheader:
+; CHECK-DEC-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[N]]) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-DEC:       while.body:
+; CHECK-DEC-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-DEC-NEXT:    [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br i1 [[TMP0]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC:       while.end:
+; CHECK-DEC-NEXT:    ret void
+;
+; CHECK-PHI: Function Attrs: strictfp
+; CHECK-PHI-LABEL: @while_preheader_eq(
+; CHECK-PHI-NEXT:  entry:
+; CHECK-PHI-NEXT:    br label [[PREHEADER:%.*]]
+; CHECK-PHI:       preheader:
+; CHECK-PHI-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-PHI-NEXT:    br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-PHI:       while.body.preheader:
+; CHECK-PHI-NEXT:    [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]]) #[[ATTR0]]
+; CHECK-PHI-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-PHI:       while.body:
+; CHECK-PHI-NEXT:    [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT:    store i32 [[I_ADDR_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT:    [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHI-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) #[[ATTR0]]
+; CHECK-PHI-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI:       while.end:
+; CHECK-PHI-NEXT:    ret void
+;
+entry:
+  br label %preheader
+
+preheader:
+  %cmp = icmp eq i32 %N, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:
+  %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %preheader ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.addr.05
+  store i32 %i.addr.05, ptr %arrayidx, align 4
+  %inc = add nuw i32 %i.addr.05, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+define void @nested(ptr nocapture %A, i32 %N) strictfp {
+; CHECK-DEC: Function Attrs: strictfp
+; CHECK-DEC-LABEL: @nested(
+; CHECK-DEC-NEXT:  entry:
+; CHECK-DEC-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-DEC-NEXT:    br i1 [[CMP20]], label [[WHILE_END7:%.*]], label [[WHILE_COND1_PREHEADER_US:%.*]]
+; CHECK-DEC:       while.cond1.preheader.us:
+; CHECK-DEC-NEXT:    [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-DEC-NEXT:    [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]]
+; CHECK-DEC-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[N]]) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br label [[WHILE_BODY3_US:%.*]]
+; CHECK-DEC:       while.body3.us:
+; CHECK-DEC-NEXT:    [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-DEC-NEXT:    [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]]
+; CHECK-DEC-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[ADD_US]]
+; CHECK-DEC-NEXT:    store i32 [[ADD_US]], ptr [[ARRAYIDX_US]], align 4
+; CHECK-DEC-NEXT:    [[INC_US]] = add nuw i32 [[J_019_US]], 1
+; CHECK-DEC-NEXT:    [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1) #[[ATTR0]]
+; CHECK-DEC-NEXT:    br i1 [[TMP0]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
+; CHECK-DEC:       while.cond1.while.end_crit_edge.us:
+; CHECK-DEC-NEXT:    [[INC6_US]] = add nuw i32 [[I_021_US]], 1
+; CHECK-DEC-NEXT:    [[EXITCOND23:%.*]] = icmp eq i32 [[INC6_US]], [[N]]
+; CHECK-DEC-NEXT:    br i1 [[EXITCOND23]], label [[WHILE_END7]], label [[WHILE_COND1_PREHEADER_US]]
+; CHECK-DEC:       while.end7:
+; CHECK-DEC-NEXT:    ret void
+;
+; CHECK-PHI: Function Attrs: strictfp
+; CHECK-PHI-LABEL: @nested(
+; CHECK-PHI-NEXT:  entry:
+; CHECK-PHI-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-PHI-NEXT:    br i1 [[CMP20]], label [[WHILE_END7:%.*]], label [[WHILE_COND1_PREHEADER_US:%.*]]
+; CHECK-PHI:       while.cond1.preheader.us:
+; CHECK-PHI-NEXT:    [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-PHI-NEXT:    [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]]
+; CHECK-PHI-NEXT:    [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]]) #[[ATTR0]]
+; CHECK-PHI-NEXT:    br label [[WHILE_BODY3_US:%.*]]
+; CHECK-PHI:       while.body3.us:
+; CHECK-PHI-NEXT:    [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-PHI-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-PHI-NEXT:    [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]]
+; CHECK-PHI-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[ADD_US]]
+; CHECK-PHI-NEXT:    store i32 [[ADD_US]], ptr [[ARRAYIDX_US]], align 4
+; CHECK-PHI-NEXT:    [[INC_US]] = add nuw i32 [[J_019_US]], 1
+; CHECK-PHI-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) #[[ATTR0]]
+; CHECK-PHI-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHI-NEXT:    br i1 [[TMP3]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
+; CHECK-PHI:       while.cond1.while.end_crit_edge.us:
+; CHECK-PHI-NEXT:    [[INC6_US]] = add nuw i32 [[I_021_US]], 1
+; CHECK-PHI-NEXT:    [[EXITCOND23:%.*]] = icmp eq i32 [[INC6_US]], [[N]]
+; CHECK-PHI-NEXT:    br i1 [[EXITCOND23]], label [[WHILE_END7]], label [[WHILE_COND1_PREHEADER_US]]
+; CHECK-PHI:       while.end7:
+; CHECK-PHI-NEXT:    ret void
+;
+entry:
+  %cmp20 = icmp eq i32 %N, 0
+  br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
+
+while.cond1.preheader.us:
+  %i.021.us = phi i32 [ %inc6.us, %while.cond1.while.end_crit_edge.us ], [ 0, %entry ]
+  %mul.us = mul i32 %i.021.us, %N
+  br label %while.body3.us
+
+while.body3.us:
+  %j.019.us = phi i32 [ 0, %while.cond1.preheader.us ], [ %inc.us, %while.body3.us ]
+  %add.us = add i32 %j.019.us, %mul.us
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i32 %add.us
+  store i32 %add.us, ptr %arrayidx.us, align 4
+  %inc.us = add nuw i32 %j.019.us, 1
+  %exitcond = icmp eq i32 %inc.us, %N
+  br i1 %exitcond, label %while.cond1.while.end_crit_edge.us, label %while.body3.us
+
+while.cond1.while.end_crit_edge.us:
+  %inc6.us = add nuw i32 %i.021.us, 1
+  %exitcond23 = icmp eq i32 %inc6.us, %N
+  br i1 %exitcond23, label %while.end7, label %while.cond1.preheader.us
+
+while.end7:
+  ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/unsigned_saturated_sub.ll b/llvm/test/Transforms/InstCombine/unsigned_saturated_sub.ll
index 5cece931b8d987..ab147584d2108f 100644
--- a/llvm/test/Transforms/InstCombine/unsigned_saturated_sub.ll
+++ b/llvm/test/Transforms/InstCombine/unsigned_saturated_sub.ll
@@ -8,6 +8,95 @@ declare void @use(i64)
 declare void @usei32(i32)
 declare void @usei1(i1)
 
+; usub_sat((sub nuw C1, A), C2) -> usub_sat(usub_sat(C1, C2), A)
+define i32 @usub_sat_C1_C2(i32 %a){
+; CHECK-LABEL: @usub_sat_C1_C2(
+; CHECK-NEXT:    [[COND:%.*]] = call i32 @llvm.usub.sat.i32(i32 50, i32 [[A:%.*]])
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %add = sub nuw i32 64, %a
+  %cond = call i32 @llvm.usub.sat.i32(i32 %add, i32 14)
+  ret i32 %cond
+}
+
+define i32 @usub_sat_C1_C2_produce_0(i32 %a){
+; CHECK-LABEL: @usub_sat_C1_C2_produce_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %add = sub nuw i32 14, %a
+  %cond = call i32 @llvm.usub.sat.i32(i32 %add, i32 14)
+  ret i32 %cond
+}
+
+define i32 @usub_sat_C1_C2_produce_0_too(i32 %a){
+; CHECK-LABEL: @usub_sat_C1_C2_produce_0_too(
+; CHECK-NEXT:    ret i32 0
+;
+  %add = sub nuw i32 12, %a
+  %cond = call i32 @llvm.usub.sat.i32(i32 %add, i32 14)
+  ret i32 %cond
+}
+
+; vector tests
+define <2 x i16> @usub_sat_C1_C2_splat(<2 x i16> %a) {
+; CHECK-LABEL: @usub_sat_C1_C2_splat(
+; CHECK-NEXT:    [[COND:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> <i16 50, i16 50>, <2 x i16> [[A:%.*]])
+; CHECK-NEXT:    ret <2 x i16> [[COND]]
+;
+  %add = sub nuw <2 x i16> <i16 64, i16 64>, %a
+  %cond = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %add, <2 x i16> <i16 14, i16 14>)
+  ret <2 x i16> %cond
+}
+
+define <2 x i16> @usub_sat_C1_C2_non_splat(<2 x i16> %a) {
+; CHECK-LABEL: @usub_sat_C1_C2_non_splat(
+; CHECK-NEXT:    [[COND:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> <i16 30, i16 50>, <2 x i16> [[A:%.*]])
+; CHECK-NEXT:    ret <2 x i16> [[COND]]
+;
+  %add = sub nuw <2 x i16> <i16 50, i16 64>, %a
+  %cond = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %add, <2 x i16> <i16 20, i16 14>)
+  ret <2 x i16> %cond
+}
+
+define <2 x i16> @usub_sat_C1_C2_splat_produce_0(<2 x i16> %a){
+; CHECK-LABEL: @usub_sat_C1_C2_splat_produce_0(
+; CHECK-NEXT:    ret <2 x i16> zeroinitializer
+;
+  %add = sub nuw <2 x i16> <i16 14, i16 14>, %a
+  %cond = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %add, <2 x i16> <i16 14, i16 14>)
+  ret <2 x i16> %cond
+}
+
+define <2 x i16> @usub_sat_C1_C2_splat_produce_0_too(<2 x i16> %a){
+; CHECK-LABEL: @usub_sat_C1_C2_splat_produce_0_too(
+; CHECK-NEXT:    ret <2 x i16> zeroinitializer
+;
+  %add = sub nuw <2 x i16> <i16 12, i16 12>, %a
+  %cond = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %add, <2 x i16> <i16 14, i16 14>)
+  ret <2 x i16> %cond
+}
+
+define <2 x i16> @usub_sat_C1_C2_non_splat_produce_0_too(<2 x i16> %a){
+; CHECK-LABEL: @usub_sat_C1_C2_non_splat_produce_0_too(
+; CHECK-NEXT:    ret <2 x i16> zeroinitializer
+;
+  %add = sub nuw <2 x i16> <i16 12, i16 13>, %a
+  %cond = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %add, <2 x i16> <i16 14, i16 15>)
+  ret <2 x i16> %cond
+}
+
+; Negative test: without nuw on the sub, the fold must not apply.
+define i32 @usub_sat_C1_C2_without_nuw(i32 %a){
+; CHECK-LABEL: @usub_sat_C1_C2_without_nuw(
+; CHECK-NEXT:    [[ADD:%.*]] = sub i32 12, [[A:%.*]]
+; CHECK-NEXT:    [[COND:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ADD]], i32 14)
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %add = sub i32 12, %a
+  %cond = call i32 @llvm.usub.sat.i32(i32 %add, i32 14)
+  ret i32 %cond
+}
+
 ; (a > b) ? a - b : 0 -> usub.sat(a, b)
 
 define i64 @max_sub_ugt(i64 %a, i64 %b) {
diff --git a/llvm/test/Transforms/TypePromotion/ARM/icmps.ll b/llvm/test/Transforms/TypePromotion/ARM/icmps.ll
index fb537a1f64705c..842aab121b96fb 100644
--- a/llvm/test/Transforms/TypePromotion/ARM/icmps.ll
+++ b/llvm/test/Transforms/TypePromotion/ARM/icmps.ll
@@ -4,9 +4,8 @@
 define i32 @test_ult_254_inc_imm(i8 zeroext %x) {
 ; CHECK-LABEL: @test_ult_254_inc_imm(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], -255
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD]], -2
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[X:%.*]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[ADD]], -2
 ; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP]], i32 35, i32 47
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -369,7 +368,7 @@ if.end:
 define i32 @degenerateicmp() {
 ; CHECK-LABEL: @degenerateicmp(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 190, 0
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt i32 -31, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt i32 225, [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 1, i32 0
 ; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
diff --git a/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll b/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll
index 78c5e7323ceab3..377708cf71134a 100644
--- a/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll
+++ b/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll
@@ -89,9 +89,8 @@ define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) {
 
 define i32 @overflow_add_positive_const_limit(i8 zeroext %a) {
 ; CHECK-LABEL: @overflow_add_positive_const_limit(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], -255
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[ADD]], -128
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[A:%.*]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[ADD]], -128
 ; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP]], i32 8, i32 16
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -145,9 +144,8 @@ define i32 @safe_add_underflow_neg(i8 zeroext %a) {
 
 define i32 @overflow_sub_negative_const_limit(i8 zeroext %a) {
 ; CHECK-LABEL: @overflow_sub_negative_const_limit(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP1]], 255
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[SUB]], -128
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[A:%.*]], -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[SUB]], -128
 ; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP]], i32 8, i32 16
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
diff --git a/llvm/tools/llvm-exegesis/lib/Assembler.cpp b/llvm/tools/llvm-exegesis/lib/Assembler.cpp
index 3aad9135978968..92ab3a96d91e6b 100644
--- a/llvm/tools/llvm-exegesis/lib/Assembler.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Assembler.cpp
@@ -305,7 +305,7 @@ Error assembleToStream(const ExegesisTarget &ET,
 
   // prologue/epilogue pass needs the reserved registers to be frozen, this
   // is usually done by the SelectionDAGISel pass.
-  MF.getRegInfo().freezeReservedRegs(MF);
+  MF.getRegInfo().freezeReservedRegs();
 
   // We create the pass manager, run the passes to populate AsmBuffer.
   MCContext &MCContext = MMIWP->getMMI().getContext();
diff --git a/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp b/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
index 63b173a727ce60..1cecbfc463fec5 100644
--- a/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
+++ b/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
@@ -1070,7 +1070,7 @@ Error DumpOutputStyle::dumpStringTableFromPdb() {
     if (IS->name_ids().empty())
       P.formatLine("Empty");
     else {
-      auto MaxID = llvm::max_element(IS->name_ids(), IS->name_ids());
+      auto MaxID = llvm::max_element(IS->name_ids());
       uint32_t Digits = NumDigits(*MaxID);
 
       P.formatLine("{0} | {1}", fmt_align("ID", AlignStyle::Right, Digits),
diff --git a/llvm/tools/llvm-reduce/CMakeLists.txt b/llvm/tools/llvm-reduce/CMakeLists.txt
index 2f1164b0478533..a4c605fcd2443c 100644
--- a/llvm/tools/llvm-reduce/CMakeLists.txt
+++ b/llvm/tools/llvm-reduce/CMakeLists.txt
@@ -31,7 +31,7 @@ add_llvm_tool(llvm-reduce
   deltas/ReduceAttributes.cpp
   deltas/ReduceBasicBlocks.cpp
   deltas/ReduceDIMetadata.cpp
-  deltas/ReduceDPValues.cpp
+  deltas/ReduceDbgRecords.cpp
   deltas/ReduceFunctionBodies.cpp
   deltas/ReduceFunctions.cpp
   deltas/ReduceGlobalObjects.cpp
diff --git a/llvm/tools/llvm-reduce/DeltaManager.cpp b/llvm/tools/llvm-reduce/DeltaManager.cpp
index fa42920ee91296..67fbc2fdc7ad43 100644
--- a/llvm/tools/llvm-reduce/DeltaManager.cpp
+++ b/llvm/tools/llvm-reduce/DeltaManager.cpp
@@ -20,7 +20,7 @@
 #include "deltas/ReduceAttributes.h"
 #include "deltas/ReduceBasicBlocks.h"
 #include "deltas/ReduceDIMetadata.h"
-#include "deltas/ReduceDPValues.h"
+#include "deltas/ReduceDbgRecords.h"
 #include "deltas/ReduceFunctionBodies.h"
 #include "deltas/ReduceFunctions.h"
 #include "deltas/ReduceGlobalObjects.h"
diff --git a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
index 353216766717e3..78e6f72d7032d5 100644
--- a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
+++ b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
@@ -414,7 +414,7 @@ static std::unique_ptr<MachineFunction> cloneMF(MachineFunction *SrcMF,
   if (!DstMF->cloneInfoFrom(*SrcMF, Src2DstMBB))
     report_fatal_error("target does not implement MachineFunctionInfo cloning");
 
-  DstMRI->freezeReservedRegs(*DstMF);
+  DstMRI->freezeReservedRegs();
 
   DstMF->verify(nullptr, "", /*AbortOnError=*/true);
   return DstMF;
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDPValues.cpp b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp
similarity index 83%
rename from llvm/tools/llvm-reduce/deltas/ReduceDPValues.cpp
rename to llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp
index f0d02a78ac6acf..94b12eb34cf6cd 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceDPValues.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp
@@ -1,4 +1,4 @@
-//===- ReduceDPValues.cpp - Specialized Delta Pass ------------------------===//
+//===- ReduceDbgRecords.cpp - Specialized Delta Pass ----------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -17,13 +17,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ReduceDPValues.h"
+#include "ReduceDbgRecords.h"
 #include "Utils.h"
 #include "llvm/ADT/STLExtras.h"
 
 using namespace llvm;
 
-static void extractDPValuesFromModule(Oracle &O, ReducerWorkItem &WorkItem) {
+static void extractDbgRecordsFromModule(Oracle &O, ReducerWorkItem &WorkItem) {
   Module &M = WorkItem.getModule();
 
   for (auto &F : M)
@@ -35,5 +35,5 @@ static void extractDPValuesFromModule(Oracle &O, ReducerWorkItem &WorkItem) {
 }
 
 void llvm::reduceDbgRecordDeltaPass(TestRunner &Test) {
-  runDeltaPass(Test, extractDPValuesFromModule, "Reducing DbgRecords");
+  runDeltaPass(Test, extractDbgRecordsFromModule, "Reducing DbgRecords");
 }
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDPValues.h b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h
similarity index 80%
rename from llvm/tools/llvm-reduce/deltas/ReduceDPValues.h
rename to llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h
index 1d3b8a35daa34b..6a8f62155ec3e2 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceDPValues.h
+++ b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h
@@ -1,4 +1,4 @@
-//===- ReduceDPValues.h -----------------------------------------*- C++ -*-===//
+//===- ReduceDbgRecords.h ---------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEDPVALUES_H
-#define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEDPVALUES_H
+#ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEDBGRECORDS_H
+#define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEDBGRECORDS_H
 
 #include "Delta.h"
 #include "llvm/IR/BasicBlock.h"
diff --git a/llvm/unittests/Analysis/LazyCallGraphTest.cpp b/llvm/unittests/Analysis/LazyCallGraphTest.cpp
index 6ef31042b6003f..69af7d92c7cf0d 100644
--- a/llvm/unittests/Analysis/LazyCallGraphTest.cpp
+++ b/llvm/unittests/Analysis/LazyCallGraphTest.cpp
@@ -1829,7 +1829,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallBothPartitionAndMerge) {
   // a cycle.
   //
   // Diagram for the graph we want on the left and the graph we use to force
-  // the ordering on the right. Edges ponit down or right.
+  // the ordering on the right. Edges point down or right.
   //
   //   A    |    A    |
   //  / \   |   / \   |
diff --git a/llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp b/llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp
index e1acb8677a0462..7f7a3720cf7ceb 100644
--- a/llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp
+++ b/llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp
@@ -55,6 +55,7 @@ TEST(AMDGPUDwarfRegMappingTests, TestWave64DwarfRegMapping) {
       for (int llvmReg : {16, 17, 32, 95, 1088, 1129, 2560, 2815, 3072, 3327}) {
         MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false));
         EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false));
+        EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, true));
       }
     }
   }
@@ -73,6 +74,7 @@ TEST(AMDGPUDwarfRegMappingTests, TestWave32DwarfRegMapping) {
       for (int llvmReg : {16, 1, 32, 95, 1088, 1129, 1536, 1791, 2048, 2303}) {
         MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false));
         EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false));
+        EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, true));
       }
     }
   }
diff --git a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
index 620835c5dfc5c2..56da4ce7b43af0 100644
--- a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
+++ b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
@@ -29,6 +29,7 @@ TEST(AMDGPU, TestWave64DwarfRegMapping) {
              {16, 17, 32, 95, 1088, 1129, 2560, 2815, 3072, 3327}) {
           MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false));
           EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false));
+          EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, true));
         }
       }
     }
@@ -52,6 +53,7 @@ TEST(AMDGPU, TestWave32DwarfRegMapping) {
              {16, 1, 32, 95, 1088, 1129, 1536, 1791, 2048, 2303}) {
           MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false));
           EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false));
+          EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, true));
         }
       }
     }
diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
index aeb25bf012d034..3a76054ca4f36d 100644
--- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp
+++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
@@ -1126,7 +1126,9 @@ TEST(MachineInstr, HasSideEffects) {
       VLDR_VPR_post,
       VLDR_VPR_pre,
       VLLDM,
+      VLLDM_T2,
       VLSTM,
+      VLSTM_T2,
       VMRS,
       VMRS_FPCXTNS,
       VMRS_FPCXTS,
diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp
index 1e80eb6b1ad50e..9194c13ccdcb08 100644
--- a/llvm/utils/TableGen/CodeEmitterGen.cpp
+++ b/llvm/utils/TableGen/CodeEmitterGen.cpp
@@ -434,10 +434,7 @@ void CodeEmitterGen::run(raw_ostream &o) {
   ArrayRef<const CodeGenInstruction *> NumberedInstructions =
       Target.getInstructionsByEnumValue();
 
-  if (any_of(NumberedInstructions, [](const CodeGenInstruction *CGI) {
-        Record *R = CGI->TheDef;
-        return R->getValue("Inst") && isa<DagInit>(R->getValueInit("Inst"));
-      })) {
+  if (Target.hasVariableLengthEncodings()) {
     emitVarLenCodeEmitter(Records, o);
   } else {
     const CodeGenHwModes &HWM = Target.getHwModes();
diff --git a/llvm/utils/TableGen/CodeGenInstruction.h b/llvm/utils/TableGen/CodeGenInstruction.h
index 963c9f0b259259..b658259b4892ee 100644
--- a/llvm/utils/TableGen/CodeGenInstruction.h
+++ b/llvm/utils/TableGen/CodeGenInstruction.h
@@ -17,14 +17,13 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGenTypes/MachineValueType.h"
+#include "llvm/TableGen/Record.h"
 #include <cassert>
 #include <string>
 #include <utility>
 #include <vector>
 
 namespace llvm {
-class Record;
-class DagInit;
 class CodeGenTarget;
 
 class CGIOperandList {
@@ -333,6 +332,12 @@ class CodeGenInstruction {
     return isOperandImpl("InOperandList", i, "IsImmediate");
   }
 
+  /// Return true if the instruction uses a variable length encoding.
+  bool isVariableLengthEncoding() const {
+    const RecordVal *RV = TheDef->getValue("Inst");
+    return RV && isa<DagInit>(RV->getValue());
+  }
+
 private:
   bool isOperandImpl(StringRef OpListName, unsigned i,
                      StringRef PropertyName) const;
diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp
index 980c9bdb6367f7..e1cf33e7f62ffc 100644
--- a/llvm/utils/TableGen/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/CodeGenTarget.cpp
@@ -480,8 +480,11 @@ void CodeGenTarget::ReadInstructions() const {
     PrintFatalError("No 'Instruction' subclasses defined!");
 
   // Parse the instructions defined in the .td file.
-  for (unsigned i = 0, e = Insts.size(); i != e; ++i)
-    Instructions[Insts[i]] = std::make_unique<CodeGenInstruction>(Insts[i]);
+  for (Record *R : Insts) {
+    Instructions[R] = std::make_unique<CodeGenInstruction>(R);
+    if (Instructions[R]->isVariableLengthEncoding())
+      HasVariableLengthEncodings = true;
+  }
 }
 
 static const CodeGenInstruction *GetInstByName(
diff --git a/llvm/utils/TableGen/CodeGenTarget.h b/llvm/utils/TableGen/CodeGenTarget.h
index 2ae3a3a2204dd0..e109c717dc018e 100644
--- a/llvm/utils/TableGen/CodeGenTarget.h
+++ b/llvm/utils/TableGen/CodeGenTarget.h
@@ -65,6 +65,7 @@ class CodeGenTarget {
   mutable SmallVector<ValueTypeByHwMode, 8> LegalValueTypes;
   CodeGenHwModes CGH;
   std::vector<Record *> MacroFusions;
+  mutable bool HasVariableLengthEncodings = false;
 
   void ReadRegAltNameIndices() const;
   void ReadInstructions() const;
@@ -209,6 +210,9 @@ class CodeGenTarget {
   }
   inst_iterator inst_end() const { return getInstructionsByEnumValue().end(); }
 
+  /// Return whether instructions have variable length encodings on this target.
+  bool hasVariableLengthEncodings() const { return HasVariableLengthEncodings; }
+
   /// isLittleEndianEncoding - are instruction bit patterns defined as  [0..n]?
   ///
   bool isLittleEndianEncoding() const;
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 27ff84bce4058e..88f24523813828 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -2499,8 +2499,8 @@ void DecoderEmitter::run(raw_ostream &o) {
   const auto &NumberedInstructions = Target.getInstructionsByEnumValue();
   NumberedEncodings.reserve(NumberedInstructions.size());
   for (const auto &NumberedInstruction : NumberedInstructions) {
-    if (const RecordVal *RV =
-            NumberedInstruction->TheDef->getValue("EncodingInfos")) {
+    const Record *InstDef = NumberedInstruction->TheDef;
+    if (const RecordVal *RV = InstDef->getValue("EncodingInfos")) {
       if (DefInit *DI = dyn_cast_or_null<DefInit>(RV->getValue())) {
         EncodingInfoByHwMode EBM(DI->getDef(), HWM);
         for (auto &KV : EBM)
@@ -2513,12 +2513,11 @@ void DecoderEmitter::run(raw_ostream &o) {
     // This instruction is encoded the same on all HwModes. Emit it for all
     // HwModes by default, otherwise leave it in a single common table.
     if (DecoderEmitterSuppressDuplicates) {
-      NumberedEncodings.emplace_back(NumberedInstruction->TheDef,
-                                     NumberedInstruction, "AllModes");
+      NumberedEncodings.emplace_back(InstDef, NumberedInstruction, "AllModes");
     } else {
       for (StringRef HwModeName : HwModeNames)
-        NumberedEncodings.emplace_back(NumberedInstruction->TheDef,
-                                       NumberedInstruction, HwModeName);
+        NumberedEncodings.emplace_back(InstDef, NumberedInstruction,
+                                       HwModeName);
     }
   }
   for (const auto &NumberedAlias :
@@ -2531,12 +2530,7 @@ void DecoderEmitter::run(raw_ostream &o) {
       OpcMap;
   std::map<unsigned, std::vector<OperandInfo>> Operands;
   std::vector<unsigned> InstrLen;
-
-  bool IsVarLenInst =
-      any_of(NumberedInstructions, [](const CodeGenInstruction *CGI) {
-        RecordVal *RV = CGI->TheDef->getValue("Inst");
-        return RV && isa<DagInit>(RV->getValue());
-      });
+  bool IsVarLenInst = Target.hasVariableLengthEncodings();
   unsigned MaxInstLen = 0;
 
   for (unsigned i = 0; i < NumberedEncodings.size(); ++i) {
diff --git a/llvm/utils/gn/secondary/clang/unittests/Interpreter/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Interpreter/BUILD.gn
index a20066436a3bf1..4107bbc12be2bc 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Interpreter/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Interpreter/BUILD.gn
@@ -14,6 +14,7 @@ unittest("ClangReplInterpreterTests") {
     "CodeCompletionTest.cpp",
     "IncrementalCompilerBuilderTest.cpp",
     "IncrementalProcessingTest.cpp",
+    "InterpreterExtensionsTest.cpp",
     "InterpreterTest.cpp",
   ]
 
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index cb52d99e5b2af7..f3d7b1bceb4d51 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -770,6 +770,7 @@ if (current_toolchain == default_toolchain) {
       "__thread/thread.h",
       "__thread/timed_backoff_policy.h",
       "__tree",
+      "__tuple/find_index.h",
       "__tuple/make_tuple_types.h",
       "__tuple/pair_like.h",
       "__tuple/sfinae_helpers.h",
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn
index 02a1db908af3f9..2f5d159dbb9d39 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn
@@ -22,7 +22,7 @@ executable("llvm-reduce") {
     "deltas/ReduceAttributes.cpp",
     "deltas/ReduceBasicBlocks.cpp",
     "deltas/ReduceDIMetadata.cpp",
-    "deltas/ReduceDPValues.cpp",
+    "deltas/ReduceDbgRecords.cpp",
     "deltas/ReduceFunctionBodies.cpp",
     "deltas/ReduceFunctions.cpp",
     "deltas/ReduceGlobalObjects.cpp",
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index b4bf1b5191232d..21942b179a0013 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -441,6 +441,11 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
     to complete before execution continues. Therefore, it should be used when
     operations on global memory can be issued far in advance of when their results
     are used (for example, by writing them to LDS).
+
+    WARNING: On architectures that do not support the BackOffBarrier feature
+    (those which will implement this barrier by emitting inline assembly),
+    use of this operation will impede the usability of memory watches (including
+    breakpoints set on variables) when debugging.
   }];
   let assemblyFormat = "attr-dict";
 }
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 53e9f2dc6a9949..32b5a1c016b6f8 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -194,6 +194,23 @@ def ROCDL_GridDimZOp : ROCDL_DeviceFunctionOp<"grid.dim.z",
 //===----------------------------------------------------------------------===//
 // Synchronization primitives
 
+// Emits the s_waitcnt instruction. The bitfield's semantics depend
+// on the target chipset.
+def ROCDL_WaitcntOp : ROCDL_Op<"waitcnt">, Arguments<(ins I32Attr:$bitfield)> {
+  string llvmBuilder = [{
+    createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_waitcnt,
+      {builder.getInt32($bitfield)});
+  }];
+  let assemblyFormat = "attr-dict $bitfield";
+}
+
+def ROCDL_SBarrierOp : ROCDL_Op<"s.barrier"> {
+  string llvmBuilder = [{
+    createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_barrier);
+  }];
+  let assemblyFormat = "attr-dict";
+}
+
 def ROCDL_BarrierOp : ROCDL_Op<"barrier"> {
   string llvmBuilder = [{
     llvm::LLVMContext &llvmContext = builder.getContext();
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index 8a5c1128152e83..4481c88b71776d 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -658,7 +658,10 @@ class RewriterBase : public OpBuilder {
     for (auto it : llvm::zip(from, to))
       replaceAllUsesWith(std::get<0>(it), std::get<1>(it));
   }
-  void replaceAllUsesWith(Operation *from, ValueRange to) {
+  // Note: This function cannot be called `replaceAllUsesWith` because the
+  // overload resolution, when called with an op that can be implicitly
+  // converted to a Value, would be ambiguous.
+  void replaceAllOpUsesWith(Operation *from, ValueRange to) {
     replaceAllUsesWith(from->getResults(), to);
   }
 
@@ -672,9 +675,12 @@ class RewriterBase : public OpBuilder {
   void replaceUsesWithIf(ValueRange from, ValueRange to,
                          function_ref<bool(OpOperand &)> functor,
                          bool *allUsesReplaced = nullptr);
-  void replaceUsesWithIf(Operation *from, ValueRange to,
-                         function_ref<bool(OpOperand &)> functor,
-                         bool *allUsesReplaced = nullptr) {
+  // Note: This function cannot be called `replaceUsesWithIf` because the
+  // overload resolution, when called with an op that can be implicitly
+  // converted to a Value, would be ambiguous.
+  void replaceOpUsesWithIf(Operation *from, ValueRange to,
+                           function_ref<bool(OpOperand &)> functor,
+                           bool *allUsesReplaced = nullptr) {
     replaceUsesWithIf(from->getResults(), to, functor, allUsesReplaced);
   }
 
@@ -682,9 +688,9 @@ class RewriterBase : public OpBuilder {
   /// the listener about every in-place op modification (for every use that was
   /// replaced). The optional `allUsesReplaced` flag is set to "true" if all
   /// uses were replaced.
-  void replaceUsesWithinBlock(Operation *op, ValueRange newValues, Block *block,
-                              bool *allUsesReplaced = nullptr) {
-    replaceUsesWithIf(
+  void replaceOpUsesWithinBlock(Operation *op, ValueRange newValues,
+                                Block *block, bool *allUsesReplaced = nullptr) {
+    replaceOpUsesWithIf(
         op, newValues,
         [block](OpOperand &use) {
           return block->getParentOp()->isProperAncestor(use.getOwner());
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 12d2462061dcf0..7e073bae75c0c9 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -270,21 +270,54 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
 };
 
 struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
-  using ConvertOpToLLVMPattern<LDSBarrierOp>::ConvertOpToLLVMPattern;
+  LDSBarrierOpLowering(LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<LDSBarrierOp>(converter), chipset(chipset) {}
+
+  Chipset chipset;
 
   LogicalResult
   matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
-                                                    LLVM::AsmDialect::AD_ATT);
-    const char *asmStr = "s_waitcnt lgkmcnt(0)\ns_barrier";
-    const char *constraints = "";
-    rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
-        op,
-        /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
-        /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
-        /*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
-        /*operand_attrs=*/ArrayAttr());
+    bool requiresInlineAsm =
+        chipset.majorVersion < 9 ||
+        (chipset.majorVersion == 9 && chipset.minorVersion < 0x0a) ||
+        (chipset.majorVersion == 11);
+
+    if (requiresInlineAsm) {
+      auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
+                                                      LLVM::AsmDialect::AD_ATT);
+      const char *asmStr =
+          ";;;WARNING: BREAKS DEBUG WATCHES\ns_waitcnt lgkmcnt(0)\ns_barrier";
+      const char *constraints = "";
+      rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
+          op,
+          /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
+          /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
+          /*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
+          /*operand_attrs=*/ArrayAttr());
+      return success();
+    }
+    constexpr int32_t ldsOnlyBitsGfx6789 = ~(0x1f << 8);
+    constexpr int32_t ldsOnlyBitsGfx10 = ~(0x3f << 8);
+    // Left in place in case someone disables the inline ASM path or future
+    // chipsets use the same bit pattern.
+    constexpr int32_t ldsOnlyBitsGfx11 = ~(0x3f << 4);
+
+    int32_t ldsOnlyBits;
+    if (chipset.majorVersion == 11)
+      ldsOnlyBits = ldsOnlyBitsGfx11;
+    else if (chipset.majorVersion == 10)
+      ldsOnlyBits = ldsOnlyBitsGfx10;
+    else if (chipset.majorVersion <= 9)
+      ldsOnlyBits = ldsOnlyBitsGfx6789;
+    else
+      return op.emitOpError(
+                 "don't know how to lower this for chipset major version ")
+             << chipset.majorVersion;
+
+    Location loc = op->getLoc();
+    rewriter.create<ROCDL::WaitcntOp>(loc, ldsOnlyBits);
+    rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
     return success();
   }
 };
@@ -834,7 +867,6 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
     return converter.convertType(t.clone(IntegerType::get(t.getContext(), 16)));
   });
 
-  patterns.add<LDSBarrierOpLowering>(converter);
   patterns
       .add<RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawPtrBufferLoadOp>,
            RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawPtrBufferStoreOp>,
@@ -848,9 +880,9 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                                ROCDL::RawPtrBufferAtomicUminOp>,
            RawBufferOpLowering<RawBufferAtomicCmpswapOp,
                                ROCDL::RawPtrBufferAtomicCmpSwap>,
-           MFMAOpLowering, WMMAOpLowering, ExtPackedFp8OpLowering,
-           PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering>(converter,
-                                                                      chipset);
+           LDSBarrierOpLowering, MFMAOpLowering, WMMAOpLowering,
+           ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
+           PackedStochRoundFp8OpLowering>(converter, chipset);
 }
 
 std::unique_ptr<Pass> mlir::createConvertAMDGPUToROCDLPass() {
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DecomposeLinalgOps.cpp b/mlir/lib/Dialect/Linalg/Transforms/DecomposeLinalgOps.cpp
index 1658ea67a46077..999359c7fa8724 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DecomposeLinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DecomposeLinalgOps.cpp
@@ -370,8 +370,8 @@ DecomposeLinalgOp::matchAndRewrite(GenericOp genericOp,
       scalarReplacements.push_back(
           residualGenericOpBody->getArgument(num + origNumInputs));
     bool allUsesReplaced = false;
-    rewriter.replaceUsesWithinBlock(peeledScalarOperation, scalarReplacements,
-                                    residualGenericOpBody, &allUsesReplaced);
+    rewriter.replaceOpUsesWithinBlock(peeledScalarOperation, scalarReplacements,
+                                      residualGenericOpBody, &allUsesReplaced);
     assert(!allUsesReplaced &&
            "peeled scalar operation is erased when it wasnt expected to be");
   }
diff --git a/mlir/lib/IR/PatternMatch.cpp b/mlir/lib/IR/PatternMatch.cpp
index 0a88e40f73ec6c..4079ccc7567256 100644
--- a/mlir/lib/IR/PatternMatch.cpp
+++ b/mlir/lib/IR/PatternMatch.cpp
@@ -122,7 +122,7 @@ void RewriterBase::replaceOp(Operation *op, ValueRange newValues) {
     rewriteListener->notifyOperationReplaced(op, newValues);
 
   // Replace all result uses. Also notifies the listener of modifications.
-  replaceAllUsesWith(op, newValues);
+  replaceAllOpUsesWith(op, newValues);
 
   // Erase op and notify listener.
   eraseOp(op);
@@ -141,7 +141,7 @@ void RewriterBase::replaceOp(Operation *op, Operation *newOp) {
     rewriteListener->notifyOperationReplaced(op, newOp);
 
   // Replace all result uses. Also notifies the listener of modifications.
-  replaceAllUsesWith(op, newOp->getResults());
+  replaceAllOpUsesWith(op, newOp->getResults());
 
   // Erase op and notify listener.
   eraseOp(op);
diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp
index eff8acdfb33d20..e25867b527b716 100644
--- a/mlir/lib/Transforms/Utils/RegionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp
@@ -161,7 +161,7 @@ SmallVector<Value> mlir::makeRegionIsolatedFromAbove(
   rewriter.setInsertionPointToStart(newEntryBlock);
   for (auto *clonedOp : clonedOperations) {
     Operation *newOp = rewriter.clone(*clonedOp, map);
-    rewriter.replaceUsesWithIf(clonedOp, newOp->getResults(), replaceIfFn);
+    rewriter.replaceOpUsesWithIf(clonedOp, newOp->getResults(), replaceIfFn);
   }
   rewriter.mergeBlocks(
       entryBlock, newEntryBlock,
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index 76e42791323494..bb1cedaa276b33 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -1,12 +1,13 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx908 | FileCheck %s
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefix=RDNA
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefix=RDNA
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx908 | FileCheck %s --check-prefixes=CHECK,GFX9,GFX908
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx90a | FileCheck %s --check-prefixes=CHECK,GFX9,GFX90A
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10,RDNA
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11,RDNA
 
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_scalar_i32
 func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
   // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32)
-  // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
   // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
   // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
   // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
@@ -19,7 +20,7 @@ func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
 func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
   // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
-  // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
   // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
   // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
   // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
@@ -30,11 +31,11 @@ func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
 
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_oob_off
 func.func @gpu_gcn_raw_buffer_load_i32_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 {
-  // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
   // RDNA:  %[[flags:.*]] = llvm.mlir.constant(553807872 : i32)
-  // RDNA:  %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
-  // RDNA:  %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
-  // RDNA:  return %[[ret]]
+  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
+  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+  // CHECK: return %[[ret]]
   %0 = amdgpu.raw_buffer_load {boundsCheck = false} %buf[%idx] : memref<64xi32>, i32 -> i32
   func.return %0 : i32
 }
@@ -103,7 +104,8 @@ func.func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ(%buf: memref<64xf8E4M3FNUZ>, %id
 // Since the lowering logic is shared with loads, only bitcasts need to be rechecked
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_store_scalar_i32
 func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
-  // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
   // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
   // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
   amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[] : i32 -> memref<i32>
@@ -113,7 +115,8 @@ func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32
 func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
-  // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
   // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
   // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
   amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -140,7 +143,8 @@ func.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32
 func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
-  // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
   // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
   // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
   amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
@@ -150,7 +154,8 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>,
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32
 func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
-  // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
   // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
   // CHECK: rocdl.raw.ptr.buffer.atomic.fmax %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
   amdgpu.raw_buffer_atomic_fmax {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
@@ -160,7 +165,8 @@ func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>,
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_smax_i32
 func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
-  // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
   // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
   // CHECK: rocdl.raw.ptr.buffer.atomic.smax %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
   amdgpu.raw_buffer_atomic_smax {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -170,7 +176,8 @@ func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>,
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_umin_i32
 func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
-  // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
   // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
   // CHECK: rocdl.raw.ptr.buffer.atomic.umin %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
   amdgpu.raw_buffer_atomic_umin {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -183,7 +190,8 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : m
   // CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32
   // CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
-  // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
   // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
   // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
   // CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32
@@ -196,7 +204,8 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : m
 // CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}})
 func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 {
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32)
-  // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
   // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
   // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64
   // CHECK: return %[[dst]]
@@ -206,7 +215,14 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : m
 
 // CHECK-LABEL: func @lds_barrier
 func.func @lds_barrier() {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier"
+  // GFX908: llvm.inline_asm has_side_effects asm_dialect = att
+  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
+  // GFX90A: rocdl.waitcnt -7937
+  // GFX90A-NEXT: rocdl.s.barrier
+  // GFX10:  rocdl.waitcnt -16129
+  // GFX10-NEXT: rocdl.s.barrier
+  // GFX11:  llvm.inline_asm has_side_effects asm_dialect = att
+  // GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
   amdgpu.lds_barrier
   func.return
 }
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 89e8e7836c3a0c..6519186d2cfdcc 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -363,6 +363,19 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
   llvm.return %source5 : i32
 }
 
+llvm.func @rocdl.waitcnt() {
+  // CHECK-LABEL: rocdl.waitcnt
+  // CHECK: rocdl.waitcnt 0
+  rocdl.waitcnt 0
+  llvm.return
+}
+
+llvm.func @rocdl.s.barrier() {
+  // CHECK-LABEL: rocdl.s.barrier
+  // CHECK: rocdl.s.barrier
+  rocdl.s.barrier
+  llvm.return
+}
 // -----
 
 // expected-error@below {{attribute attached to unexpected op}}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 3ea6292c679d90..d35acb0475e6f8 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -88,7 +88,23 @@ llvm.func @rocdl.bpermute(%src : i32) -> i32 {
   llvm.return %0 : i32
 }
 
+llvm.func @rocdl.waitcnt() {
+  // CHECK-LABEL: rocdl.waitcnt
+  // CHECK-NEXT: call void @llvm.amdgcn.s.waitcnt(i32 0)
+  rocdl.waitcnt 0
+  llvm.return
+}
+
+llvm.func @rocdl.s.barrier() {
+  // CHECK-LABEL: rocdl.s.barrier
+  // CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+  rocdl.s.barrier
+  llvm.return
+}
+
+
 llvm.func @rocdl.barrier() {
+  // CHECK-LABEL: rocdl.barrier
   // CHECK:      fence syncscope("workgroup") release
   // CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
   // CHECK-NEXT: fence syncscope("workgroup") acquire
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 6510dd9b3561d3..48d7124e56c57c 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1404,9 +1404,19 @@ extern void __kmp_query_cpuid(kmp_cpuinfo_t *p);
 // subleaf is only needed for cache and topology discovery and can be set to
 // zero in most cases
 static inline void __kmp_x86_cpuid(int leaf, int subleaf, struct kmp_cpuid *p) {
+#if KMP_ARCH_X86 && (defined(__pic__) || defined(__PIC__))
+  // On i386 PIC builds ebx is reserved by the compiler: save it in edi, run
+  // cpuid, then swap back so ebx is restored and cpuid's ebx is read from edi.
+  __asm__ __volatile__("mov %%ebx, %%edi\n"
+                       "cpuid\n"
+                       "xchg %%edi, %%ebx\n"
+                       : "=a"(p->eax), "=D"(p->ebx), "=c"(p->ecx), "=d"(p->edx)
+                       : "a"(leaf), "c"(subleaf));
+#else
   __asm__ __volatile__("cpuid"
                        : "=a"(p->eax), "=b"(p->ebx), "=c"(p->ecx), "=d"(p->edx)
                        : "a"(leaf), "c"(subleaf));
+#endif
 }
 // Load p into FPU control word
 static inline void __kmp_load_x87_fpu_control_word(const kmp_int16 *p) {
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index ae0b6459d79ed0..b79b57eafd6a81 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -1829,14 +1829,8 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
 
   // Figure out the depth and types in the topology
   depth = 0;
-  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
-  KMP_ASSERT(pu);
-  obj = pu;
-  types[depth] = KMP_HW_THREAD;
-  hwloc_types[depth] = obj->type;
-  depth++;
-  while (obj != root && obj != NULL) {
-    obj = obj->parent;
+  obj = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
+  while (obj && obj != root) {
 #if HWLOC_API_VERSION >= 0x00020000
     if (obj->memory_arity) {
       hwloc_obj_t memory;
@@ -1858,6 +1852,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
       hwloc_types[depth] = obj->type;
       depth++;
     }
+    obj = obj->parent;
   }
   KMP_ASSERT(depth > 0);
 
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index ce775ff49f4d99..a60bdb968371e0 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -1743,14 +1743,8 @@ __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
       __kmp_alloc_argv_entries(argc, team, TRUE);
       team->t.t_argc = argc;
       argv = (void **)team->t.t_argv;
-      if (ap) {
-        for (i = argc - 1; i >= 0; --i)
-          *argv++ = va_arg(kmp_va_deref(ap), void *);
-      } else {
-        for (i = 0; i < argc; ++i)
-          // Get args from parent team for teams construct
-          argv[i] = parent_team->t.t_argv[i];
-      }
+      for (i = argc - 1; i >= 0; --i)
+        *argv++ = va_arg(kmp_va_deref(ap), void *);
       // AC: revert change made in __kmpc_serialized_parallel()
       //     because initial code in teams should have level=0
       team->t.t_level--;
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index ec86ee07472c1e..b9c8289b5c51c0 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -4373,8 +4373,8 @@ static void __kmp_stg_parse_omp_schedule(char const *name, char const *value,
                                          void *data) {
   size_t length;
   const char *ptr = value;
-  SKIP_WS(ptr);
-  if (value) {
+  if (ptr) {
+    SKIP_WS(ptr);
     length = KMP_STRLEN(value);
     if (length) {
       if (value[length - 1] == '"' || value[length - 1] == '\'')
@@ -4889,9 +4889,6 @@ static void __kmp_stg_parse_spin_backoff_params(const char *name,
       if (num <= 0) { // The number of retries should be > 0
         msg = KMP_I18N_STR(ValueTooSmall);
         num = 1;
-      } else if (num > KMP_INT_MAX) {
-        msg = KMP_I18N_STR(ValueTooLarge);
-        num = KMP_INT_MAX;
       }
       if (msg != NULL) {
         // Message is not empty. Print warning.
@@ -4988,9 +4985,6 @@ static void __kmp_stg_parse_adaptive_lock_props(const char *name,
       if (num < 0) { // The number of retries should be >= 0
         msg = KMP_I18N_STR(ValueTooSmall);
         num = 1;
-      } else if (num > KMP_INT_MAX) {
-        msg = KMP_I18N_STR(ValueTooLarge);
-        num = KMP_INT_MAX;
       }
       if (msg != NULL) {
         // Message is not empty. Print warning.
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 6e8b948efa064f..155e17ba7ec874 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -2662,8 +2662,8 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
   if (tg == NULL)
     tg = thread->th.th_current_task->td_taskgroup;
   KMP_ASSERT(tg != NULL);
-  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
-  kmp_int32 num = tg->reduce_num_data;
+  kmp_taskred_data_t *arr;
+  kmp_int32 num;
   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
 
 #if OMPX_TASKGRAPH
@@ -2680,6 +2680,8 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
 
   KMP_ASSERT(data != NULL);
   while (tg != NULL) {
+    arr = (kmp_taskred_data_t *)(tg->reduce_data);
+    num = tg->reduce_num_data;
     for (int i = 0; i < num; ++i) {
       if (!arr[i].flags.lazy_priv) {
         if (data == arr[i].reduce_shar ||
@@ -2713,8 +2715,6 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
     }
     KMP_ASSERT(tg->parent);
     tg = tg->parent;
-    arr = (kmp_taskred_data_t *)(tg->reduce_data);
-    num = tg->reduce_num_data;
   }
   KMP_ASSERT2(0, "Unknown task reduction item");
   return NULL; // ERROR, this line never executed
diff --git a/openmp/runtime/src/kmp_threadprivate.cpp b/openmp/runtime/src/kmp_threadprivate.cpp
index b79ac7d6d2b254..c4a1ec6e10239a 100644
--- a/openmp/runtime/src/kmp_threadprivate.cpp
+++ b/openmp/runtime/src/kmp_threadprivate.cpp
@@ -248,16 +248,16 @@ void __kmp_common_destroy_gtid(int gtid) {
         if (d_tn->is_vec) {
           if (d_tn->dt.dtorv != 0) {
             (void)(*d_tn->dt.dtorv)(tn->par_addr, d_tn->vec_len);
-          }
-          if (d_tn->obj_init != 0) {
-            (void)(*d_tn->dt.dtorv)(d_tn->obj_init, d_tn->vec_len);
+            if (d_tn->obj_init != 0) {
+              (void)(*d_tn->dt.dtorv)(d_tn->obj_init, d_tn->vec_len);
+            }
           }
         } else {
           if (d_tn->dt.dtor != 0) {
             (void)(*d_tn->dt.dtor)(tn->par_addr);
-          }
-          if (d_tn->obj_init != 0) {
-            (void)(*d_tn->dt.dtor)(d_tn->obj_init);
+            if (d_tn->obj_init != 0) {
+              (void)(*d_tn->dt.dtor)(d_tn->obj_init);
+            }
           }
         }
       }