diff --git a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.h b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.h
--- a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.h
+++ b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.h
@@ -10,6 +10,9 @@
 #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_COMPARISONINTEMPFAILURERETRYCHECK_H
 
 #include "../ClangTidyCheck.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include <string>
 
 namespace clang {
 namespace tidy {
@@ -22,10 +25,14 @@
 /// TEMP_FAILURE_RETRY is a macro provided by both glibc and Bionic.
 class ComparisonInTempFailureRetryCheck : public ClangTidyCheck {
 public:
-  ComparisonInTempFailureRetryCheck(StringRef Name, ClangTidyContext *Context)
-      : ClangTidyCheck(Name, Context) {}
+  ComparisonInTempFailureRetryCheck(StringRef Name, ClangTidyContext *Context);
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
   void registerMatchers(ast_matchers::MatchFinder *Finder) override;
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+
+private:
+  const std::string RawRetryList;
+  SmallVector<StringRef, 5> RetryMacros;
 };
 
 } // namespace android
diff --git a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp
--- a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp
+++ b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp
@@ -18,32 +18,17 @@
 namespace tidy {
 namespace android {
 
-namespace {
-AST_MATCHER(BinaryOperator, isRHSATempFailureRetryArg) {
-  if (!Node.getBeginLoc().isMacroID())
-    return false;
-
-  const SourceManager &SM = Finder->getASTContext().getSourceManager();
-  if (!SM.isMacroArgExpansion(Node.getRHS()->IgnoreParenCasts()->getBeginLoc()))
-    return false;
-
-  const LangOptions &Opts = Finder->getASTContext().getLangOpts();
-  SourceLocation LocStart = Node.getBeginLoc();
-  while (LocStart.isMacroID()) {
-    SourceLocation Invocation = SM.getImmediateMacroCallerLoc(LocStart);
-    Token Tok;
-    if (!Lexer::getRawToken(SM.getSpellingLoc(Invocation), Tok, SM, Opts,
-                            /*IgnoreWhiteSpace=*/true)) {
-      if (Tok.getKind() == tok::raw_identifier &&
-          Tok.getRawIdentifier() == "TEMP_FAILURE_RETRY")
-        return true;
-    }
+ComparisonInTempFailureRetryCheck::ComparisonInTempFailureRetryCheck(
+    StringRef Name, ClangTidyContext *Context)
+    : ClangTidyCheck(Name, Context),
+      RawRetryList(Options.get("RetryMacros", "TEMP_FAILURE_RETRY")) {
+  StringRef(RawRetryList).split(RetryMacros, ",", -1, false);
+}
 
-    LocStart = Invocation;
-  }
-  return false;
+void ComparisonInTempFailureRetryCheck::storeOptions(
+    ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "RetryMacros", RawRetryList);
 }
-} // namespace
 
 void ComparisonInTempFailureRetryCheck::registerMatchers(MatchFinder *Finder) {
   // Both glibc's and Bionic's TEMP_FAILURE_RETRY macros structurally look like:
@@ -63,15 +48,43 @@
   Finder->addMatcher(
       binaryOperator(hasOperatorName("="),
                      hasRHS(ignoringParenCasts(
-                         binaryOperator(isComparisonOperator()).bind("binop"))),
-                     isRHSATempFailureRetryArg()),
+                         binaryOperator(isComparisonOperator()).bind("inner"))))
+          .bind("outer"),
       this);
 }
 
 void ComparisonInTempFailureRetryCheck::check(
     const MatchFinder::MatchResult &Result) {
-  const auto &BinOp = *Result.Nodes.getNodeAs<BinaryOperator>("binop");
-  diag(BinOp.getOperatorLoc(), "top-level comparison in TEMP_FAILURE_RETRY");
+  StringRef RetryMacroName;
+  const auto &Node = *Result.Nodes.getNodeAs<BinaryOperator>("outer");
+  if (!Node.getBeginLoc().isMacroID())
+    return;
+
+  const SourceManager &SM = *Result.SourceManager;
+  if (!SM.isMacroArgExpansion(Node.getRHS()->IgnoreParenCasts()->getBeginLoc()))
+    return;
+
+  const LangOptions &Opts = Result.Context->getLangOpts();
+  SourceLocation LocStart = Node.getBeginLoc();
+  while (LocStart.isMacroID()) {
+    SourceLocation Invocation = SM.getImmediateMacroCallerLoc(LocStart);
+    Token Tok;
+    if (!Lexer::getRawToken(SM.getSpellingLoc(Invocation), Tok, SM, Opts,
+                            /*IgnoreWhiteSpace=*/true)) {
+      if (Tok.getKind() == tok::raw_identifier &&
+          llvm::is_contained(RetryMacros, Tok.getRawIdentifier())) {
+        RetryMacroName = Tok.getRawIdentifier();
+        break;
+      }
+    }
+
+    LocStart = Invocation;
+  }
+  if (RetryMacroName.empty())
+    return;
+
+  const auto &Inner = *Result.Nodes.getNodeAs<BinaryOperator>("inner");
+  diag(Inner.getOperatorLoc(), "top-level comparison in %0") << RetryMacroName;
 
   // FIXME: FixIts would be nice, but potentially nontrivial when nested macros
   // happen, e.g. `TEMP_FAILURE_RETRY(IS_ZERO(foo()))`
diff --git a/clang-tools-extra/clangd/URI.cpp b/clang-tools-extra/clangd/URI.cpp
--- a/clang-tools-extra/clangd/URI.cpp
+++ b/clang-tools-extra/clangd/URI.cpp
@@ -111,7 +111,6 @@
 /// - Reserved characters always escaped with exceptions like '/'.
 /// - All other characters are escaped.
 void percentEncode(llvm::StringRef Content, std::string &Out) {
-  std::string Result;
   for (unsigned char C : Content)
     if (shouldEscape(C)) {
       Out.push_back('%');
diff --git a/clang-tools-extra/clangd/quality/CompletionModelCodegen.py b/clang-tools-extra/clangd/quality/CompletionModelCodegen.py
--- a/clang-tools-extra/clangd/quality/CompletionModelCodegen.py
+++ b/clang-tools-extra/clangd/quality/CompletionModelCodegen.py
@@ -1,7 +1,7 @@
 """Code generator for Code Completion Model Inference.
 
 Tool runs on the Decision Forest model defined in {model} directory.
-It generates two files: {output_dir}/{filename}.h and {output_dir}/{filename}.cpp 
+It generates two files: {output_dir}/{filename}.h and {output_dir}/{filename}.cpp
 The generated files defines the Example class named {cpp_class} having all the features as class members.
 The generated runtime provides an `Evaluate` function which can be used to score a code completion candidate.
 """
@@ -39,34 +39,32 @@
 
 
 def boost_node(n, label, next_label):
-    """Returns code snippet for a leaf/boost node.
-    Adds value of leaf to the score and jumps to the root of the next tree."""
-    return "%s: Score += %s; goto %s;" % (
-        label, n['score'], next_label)
+    """Returns code snippet for a leaf/boost node."""
+    return "%s: return %s;" % (label, n['score'])
 
 
 def if_greater_node(n, label, next_label):
     """Returns code snippet for a if_greater node.
-    Jumps to true_label if the Example feature (NUMBER) is greater than the threshold. 
-    Comparing integers is much faster than comparing floats. Assuming floating points 
+    Jumps to true_label if the Example feature (NUMBER) is greater than the threshold.
+    Comparing integers is much faster than comparing floats. Assuming floating points
     are represented as IEEE 754, it order-encodes the floats to integers before comparing them.
     Control falls through if condition is evaluated to false."""
     threshold = n["threshold"]
-    return "%s: if (E.%s >= %s /*%s*/) goto %s;" % (
-        label, n['feature'], order_encode(threshold), threshold, next_label)
+    return "%s: if (E.get%s() >= %s /*%s*/) goto %s;" % (
+        label, n['feature'], order_encode(threshold), threshold, next_label)
 
 
 def if_member_node(n, label, next_label):
     """Returns code snippet for a if_member node.
-    Jumps to true_label if the Example feature (ENUM) is present in the set of enum values 
+    Jumps to true_label if the Example feature (ENUM) is present in the set of enum values
    described in the node.
     Control falls through if condition is evaluated to false."""
     members = '|'.join([
         "BIT(%s_type::%s)" % (n['feature'], member)
         for member in n["set"]
     ])
-    return "%s: if (E.%s & (%s)) goto %s;" % (
-        label, n['feature'], members, next_label)
+    return "%s: if (E.get%s() & (%s)) goto %s;" % (
+        label, n['feature'], members, next_label)
 
 
 def node(n, label, next_label):
@@ -94,8 +92,6 @@
     """
     label = "t%d_n%d" % (tree_num, node_num)
     code = []
-    if node_num == 0:
-        code.append("t%d:" % tree_num)
 
     if t["operation"] == "boost":
         code.append(node(t, label=label, next_label="t%d" % (tree_num + 1)))
@@ -119,13 +115,15 @@
     """Returns code for header declaring the inference runtime.
 
     Declares the Example class named {cpp_class} inside relevant namespaces.
-    The Example class contains all the features as class members. This 
+    The Example class contains all the features as class members. This
     class can be used to represent a code completion candidate.
     Provides `float Evaluate()` function which can be used to score the Example.
     """
     setters = []
+    getters = []
     for f in features_json:
         feature = f["name"]
+
         if f["kind"] == "NUMBER":
             # Floats are order-encoded to integers for faster comparison.
             setters.append(
@@ -138,8 +136,15 @@
             raise ValueError("Unhandled feature type.", f["kind"])
 
     # Class members represent all the features of the Example.
-    class_members = ["uint32_t %s = 0;" % f['name'] for f in features_json]
-
+    class_members = [
+        "uint32_t %s = 0;" % f['name']
+        for f in features_json
+    ]
+    getters = [
+        "LLVM_ATTRIBUTE_ALWAYS_INLINE uint32_t get%s() const { return %s; }"
+        % (f['name'], f['name'])
+        for f in features_json
+    ]
     nline = "\n  "
     guard = header_guard(filename)
     return """#ifndef %s
@@ -150,6 +155,10 @@
 %s
 class %s {
 public:
+  // Setters.
+  %s
+
+  // Getters.
   %s
 
 private:
@@ -158,18 +167,16 @@
   // Produces an integer that sorts in the same order as F.
   // That is: a < b <==> orderEncode(a) < orderEncode(b).
   static uint32_t OrderEncode(float F);
-  friend float Evaluate(const %s&);
 };
 
-// The function may have large number of lines of code. MSAN
-// build times out in such case.
-LLVM_NO_SANITIZE("memory")
 float Evaluate(const %s&);
 %s
 #endif // %s
-""" % (guard, guard, cpp_class.ns_begin(), cpp_class.name, nline.join(setters),
-       nline.join(class_members), cpp_class.name, cpp_class.name,
-       cpp_class.ns_end(), guard)
+""" % (guard, guard, cpp_class.ns_begin(), cpp_class.name,
+       nline.join(setters),
+       nline.join(getters),
+       nline.join(class_members),
+       cpp_class.name, cpp_class.ns_end(), guard)
 
 
 def order_encode(v):
@@ -182,21 +189,33 @@
 
 
 def evaluate_func(forest_json, cpp_class):
-    """Generates code for `float Evaluate(const {Example}&)` function.
-    The generated function can be used to score an Example."""
-    code = "float Evaluate(const %s& E) {\n" % cpp_class.name
-    lines = []
-    lines.append("float Score = 0;")
+    """Generates evaluation functions for each tree and combines them in
+    `float Evaluate(const {Example}&)` function. This function can be
+    used to score an Example."""
+
+    code = ""
+
+    # Generate evaluation function of each tree.
+    code += "namespace {\n"
     tree_num = 0
     for tree_json in forest_json:
-        lines.extend(tree(tree_json, tree_num=tree_num, node_num=0)[0])
-        lines.append("")
+        code += "LLVM_ATTRIBUTE_NOINLINE float EvaluateTree%d(const %s& E) {\n" % (tree_num, cpp_class.name)
+        code += "  " + \
+            "\n  ".join(
+                tree(tree_json, tree_num=tree_num, node_num=0)[0]) + "\n"
+        code += "}\n\n"
         tree_num += 1
+    code += "} // namespace\n\n"
+
+    # Combine the scores of all trees in the final function.
+    # MSAN will timeout if these functions are inlined.
+    code += "float Evaluate(const %s& E) {\n" % cpp_class.name
+    code += "  float Score = 0;\n"
+    for tree_num in range(len(forest_json)):
+        code += "  Score += EvaluateTree%d(E);\n" % tree_num
+    code += "  return Score;\n"
+    code += "}\n"
 
-    lines.append("t%s: // No such tree." % len(forest_json))
-    lines.append("return Score;")
-    code += "  " + "\n  ".join(lines)
-    code += "\n}"
     return code
@@ -218,9 +237,9 @@
 
     # using-decl for ENUM features.
     using_decls = "\n".join("using %s_type = %s;" % (
-        feature['name'], feature['type'])
-        for feature in features_json
-        if feature["kind"] == "ENUM")
+            feature['name'], feature['type'])
+        for feature in features_json
+        if feature["kind"] == "ENUM")
     nl = "\n"
     return """%s
@@ -287,7 +306,9 @@
 
     with open(header_file, 'w+t') as output_h:
         output_h.write(gen_header_code(
-            features_json=features_json, cpp_class=cpp_class, filename=filename))
+            features_json=features_json,
+            cpp_class=cpp_class,
+            filename=filename))
 
 
 if __name__ == '__main__':
diff --git a/clang-tools-extra/clangd/test/check-fail.test b/clang-tools-extra/clangd/test/check-fail.test
new file mode 100644
--- /dev/null
+++ b/clang-tools-extra/clangd/test/check-fail.test
@@ -0,0 +1,14 @@
+// RUN: cp %s %t.cpp
+// RUN: not clangd -check=%t.cpp 2>&1 | FileCheck -strict-whitespace %s
+
+// CHECK: Testing on source file {{.*}}check-fail.test
+// CHECK: internal (cc1) args are: -cc1
+// CHECK: Building preamble...
+// CHECK: [pp_file_not_found] Line {{.*}}: 'missing.h' file not found
+// CHECK: Building AST...
+// CHECK: Testing features at each token
+// CHECK: tweak: ExpandAutoType ==> FAIL
+// CHECK: All checks completed, 2 errors
+
+#include "missing.h"
+auto x = []{};
diff --git a/clang-tools-extra/clangd/test/check.test b/clang-tools-extra/clangd/test/check.test
new file mode 100644
--- /dev/null
+++ b/clang-tools-extra/clangd/test/check.test
@@ -0,0 +1,13 @@
+# RUN: clangd -log=verbose -check 2>&1 | FileCheck -strict-whitespace %s
+
+CHECK: Testing on source file {{.*}}test.cc
+CHECK: internal (cc1) args are: -cc1
+CHECK: Building preamble...
+CHECK: Built preamble
+CHECK: Building AST...
+CHECK: Testing features at each token
+CHECK-DAG: hover: false
+CHECK-DAG: hover: true
+CHECK-DAG: tweak: AddUsing
+CHECK: All checks completed, 0 errors
+
diff --git a/clang-tools-extra/clangd/tool/CMakeLists.txt b/clang-tools-extra/clangd/tool/CMakeLists.txt
--- a/clang-tools-extra/clangd/tool/CMakeLists.txt
+++ b/clang-tools-extra/clangd/tool/CMakeLists.txt
@@ -3,6 +3,7 @@
   )
 
 add_clang_tool(clangd
   ClangdMain.cpp
+  Check.cpp
   $<TARGET_OBJECTS:obj.clangDaemonTweaks>
   )
diff --git a/clang-tools-extra/clangd/tool/Check.cpp b/clang-tools-extra/clangd/tool/Check.cpp
new file mode 100644
--- /dev/null
+++ b/clang-tools-extra/clangd/tool/Check.cpp
@@ -0,0 +1,258 @@
+//===--- Check.cpp - clangd self-diagnostics ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Many basic problems can occur processing a file in clangd, e.g.:
+//  - system includes are not found
+//  - crash when indexing its AST
+// clangd --check provides a simplified, isolated way to reproduce these,
+// with no editor, LSP, threads, background indexing etc to contend with.
+//
+// One important use case is gathering information for bug reports.
+// Another is reproducing crashes, and checking which setting prevent them.
+//
+// It simulates opening a file (determining compile command, parsing, indexing)
+// and then running features at many locations.
+//
+// Currently it adds some basic logging of progress and results.
+// We should consider extending it to also recognize common symptoms and
+// recommend solutions (e.g. standard library installation issues).
+//
+//===----------------------------------------------------------------------===//
+
+#include "ClangdLSPServer.h"
+#include "CodeComplete.h"
+#include "GlobalCompilationDatabase.h"
+#include "Hover.h"
+#include "ParsedAST.h"
+#include "Preamble.h"
+#include "SourceCode.h"
+#include "XRefs.h"
+#include "index/CanonicalIncludes.h"
+#include "index/FileIndex.h"
+#include "refactor/Tweak.h"
+#include "support/ThreadsafeFS.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/Basic/DiagnosticIDs.h"
+#include "clang/Format/Format.h"
+#include "clang/Frontend/CompilerInvocation.h"
+#include "clang/Tooling/CompilationDatabase.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Path.h"
+
+namespace clang {
+namespace clangd {
+namespace {
+
+// Print (and count) the error-level diagnostics (warnings are ignored).
+unsigned showErrors(llvm::ArrayRef<Diag> Diags) {
+  unsigned ErrCount = 0;
+  for (const auto &D : Diags) {
+    if (D.Severity >= DiagnosticsEngine::Error) {
+      elog("[{0}] Line {1}: {2}", D.Name, D.Range.start.line + 1, D.Message);
+      ++ErrCount;
+    }
+  }
+  return ErrCount;
+}
+
+// This class is just a linear pipeline whose functions get called in sequence.
+// Each exercises part of clangd's logic on our test file and logs results.
+// Later steps depend on state built in earlier ones (such as the AST).
+// Many steps can fatally fail (return false), then subsequent ones cannot run.
+// Nonfatal failures are logged and tracked in ErrCount.
+class Checker {
+  // from constructor
+  std::string File;
+  ClangdLSPServer::Options Opts;
+  // from buildCommand
+  tooling::CompileCommand Cmd;
+  // from buildInvocation
+  ParseInputs Inputs;
+  std::unique_ptr<CompilerInvocation> Invocation;
+  format::FormatStyle Style;
+  // from buildAST
+  std::shared_ptr<const PreambleData> Preamble;
+  llvm::Optional<ParsedAST> AST;
+  FileIndex Index;
+
+public:
+  // Number of non-fatal errors seen.
+  unsigned ErrCount = 0;
+
+  Checker(llvm::StringRef File, const ClangdLSPServer::Options &Opts)
+      : File(File), Opts(Opts) {}
+
+  // Read compilation database and choose a compile command for the file.
+  bool buildCommand() {
+    log("Loading compilation database...");
+    std::unique_ptr<GlobalCompilationDatabase> BaseCDB =
+        std::make_unique<DirectoryBasedGlobalCompilationDatabase>(
+            Opts.CompileCommandsDir);
+    BaseCDB = getQueryDriverDatabase(llvm::makeArrayRef(Opts.QueryDriverGlobs),
+                                     std::move(BaseCDB));
+    auto Mangler = CommandMangler::detect();
+    if (Opts.ResourceDir)
+      Mangler.ResourceDir = *Opts.ResourceDir;
+    auto CDB = std::make_unique<OverlayCDB>(
+        BaseCDB.get(), std::vector<std::string>{},
+        tooling::ArgumentsAdjuster(std::move(Mangler)));
+
+    if (auto TrueCmd = CDB->getCompileCommand(File)) {
+      Cmd = std::move(*TrueCmd);
+      log("Compile command from CDB is: {0}", llvm::join(Cmd.CommandLine, " "));
+    } else {
+      Cmd = CDB->getFallbackCommand(File);
+      log("Generic fallback command is: {0}", llvm::join(Cmd.CommandLine, " "));
+    }
+
+    return true;
+  }
+
+  // Prepare inputs and build CompilerInvocation (parsed compile command).
+  bool buildInvocation(const ThreadsafeFS &TFS,
+                       llvm::Optional<std::string> Contents) {
+    StoreDiags CaptureInvocationDiags;
+    std::vector<std::string> CC1Args;
+    Inputs.CompileCommand = Cmd;
+    Inputs.TFS = &TFS;
+    if (Contents.hasValue()) {
+      Inputs.Contents = *Contents;
+      log("Imaginary source file contents:\n{0}", Inputs.Contents);
+    } else {
+      if (auto Contents = TFS.view(llvm::None)->getBufferForFile(File)) {
+        Inputs.Contents = Contents->get()->getBuffer().str();
+      } else {
+        elog("Couldn't read {0}: {1}", File, Contents.getError().message());
+        return false;
+      }
+    }
+    Inputs.Opts.ClangTidyOpts =
+        Opts.GetClangTidyOptions(*TFS.view(llvm::None), File);
+    log("Parsing command...");
+    Invocation =
+        buildCompilerInvocation(Inputs, CaptureInvocationDiags, &CC1Args);
+    auto InvocationDiags = CaptureInvocationDiags.take();
+    ErrCount += showErrors(InvocationDiags);
+    log("internal (cc1) args are: {0}", llvm::join(CC1Args, " "));
+    if (!Invocation) {
+      elog("Failed to parse command line");
+      return false;
+    }
+
+    // FIXME: Check that resource-dir/built-in-headers exist?
+
+    Style = getFormatStyleForFile(File, Inputs.Contents, TFS);
+
+    return true;
+  }
+
+  // Build preamble and AST, and index them.
+  bool buildAST() {
+    log("Building preamble...");
+    Preamble =
+        buildPreamble(File, *Invocation, Inputs, /*StoreInMemory=*/true,
+                      [&](ASTContext &Ctx, std::shared_ptr<Preprocessor> PP,
+                          const CanonicalIncludes &Includes) {
+                        if (!Opts.BuildDynamicSymbolIndex)
+                          return;
+                        log("Indexing headers...");
+                        Index.updatePreamble(File, /*Version=*/"null", Ctx,
+                                             std::move(PP), Includes);
+                      });
+    if (!Preamble) {
+      elog("Failed to build preamble");
+      return false;
+    }
+    ErrCount += showErrors(Preamble->Diags);
+
+    log("Building AST...");
+    AST = ParsedAST::build(File, Inputs, std::move(Invocation),
+                           /*InvocationDiags=*/std::vector<Diag>{}, Preamble);
+    if (!AST) {
+      elog("Failed to build AST");
+      return false;
+    }
+    ErrCount += showErrors(llvm::makeArrayRef(AST->getDiagnostics())
+                               .drop_front(Preamble->Diags.size()));
+
+    if (Opts.BuildDynamicSymbolIndex) {
+      log("Indexing AST...");
+      Index.updateMain(File, *AST);
+    }
+    return true;
+  }
+
+  // Run AST-based features at each token in the file.
+  void testLocationFeatures() {
+    log("Testing features at each token (may be slow in large files)");
+    auto SpelledTokens =
+        AST->getTokens().spelledTokens(AST->getSourceManager().getMainFileID());
+    for (const auto &Tok : SpelledTokens) {
+      unsigned Start = AST->getSourceManager().getFileOffset(Tok.location());
+      unsigned End = Start + Tok.length();
+      Position Pos = offsetToPosition(Inputs.Contents, Start);
+      // FIXME: dumping the tokens may leak sensitive code into bug reports.
+      // Add an option to turn this off, once we decide how options work.
+      vlog("  {0} {1}", Pos, Tok.text(AST->getSourceManager()));
+      auto Tree = SelectionTree::createRight(AST->getASTContext(),
+                                             AST->getTokens(), Start, End);
+      Tweak::Selection Selection(&Index, *AST, Start, End, std::move(Tree));
+      for (const auto &T : prepareTweaks(Selection, Opts.TweakFilter)) {
+        auto Result = T->apply(Selection);
+        if (!Result) {
+          elog("    tweak: {0} ==> FAIL: {1}", T->id(), Result.takeError());
+          ++ErrCount;
+        } else {
+          vlog("    tweak: {0}", T->id());
+        }
+      }
+      unsigned Definitions = locateSymbolAt(*AST, Pos, &Index).size();
+      vlog("    definition: {0}", Definitions);
+
+      auto Hover = getHover(*AST, Pos, Style, &Index);
+      vlog("    hover: {0}", Hover.hasValue());
+
+      // FIXME: it'd be nice to include code completion, but it's too slow.
+      // Maybe in combination with a line restriction?
+    }
+  }
+};
+
+} // namespace
+
+bool check(llvm::StringRef File, const ThreadsafeFS &TFS,
+           const ClangdLSPServer::Options &Opts) {
+  llvm::SmallString<0> FakeFile;
+  llvm::Optional<std::string> Contents;
+  if (File.empty()) {
+    llvm::sys::path::system_temp_directory(false, FakeFile);
+    llvm::sys::path::append(FakeFile, "test.cc");
+    File = FakeFile;
+    Contents = R"cpp(
+      #include <stddef.h>
+      #include <string>
+
+      size_t N = 50;
+      auto xxx = std::string(N, 'x');
+    )cpp";
+  }
+  log("Testing on source file {0}", File);
+
+  Checker C(File, Opts);
+  if (!C.buildCommand() || !C.buildInvocation(TFS, Contents) || !C.buildAST())
+    return false;
+  C.testLocationFeatures();
+
+  log("All checks completed, {0} errors", C.ErrCount);
+  return C.ErrCount == 0;
+}
+
+} // namespace clangd
+} // namespace clang
diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -47,6 +47,11 @@
 
 namespace clang {
 namespace clangd {
+
+// Implemented in Check.cpp.
+bool check(const llvm::StringRef File, const ThreadsafeFS &TFS,
+           const ClangdLSPServer::Options &Opts);
+
 namespace {
 
 using llvm::cl::cat;
@@ -57,6 +62,7 @@
 using llvm::cl::list;
 using llvm::cl::opt;
 using llvm::cl::OptionCategory;
+using llvm::cl::ValueOptional;
 using llvm::cl::values;
 
 // All flags must be placed in a category, or they will be shown neither in
@@ -354,6 +360,16 @@
   Hidden,
 };
 
+opt<Path> CheckFile{
+    "check",
+    cat(Misc),
+    desc("Parse one file in isolation instead of acting as a language server. "
+         "Useful to investigate/reproduce crashes or configuration problems. "
+         "With --check=<filename>, attempts to parse a particular file."),
+    init(""),
+    ValueOptional,
+};
+
 enum PCHStorageFlag { Disk, Memory };
 opt<PCHStorageFlag> PCHStorage{
     "pch-storage",
@@ -541,7 +557,8 @@
 
 enum class ErrorResultCode : int {
   NoShutdownRequest = 1,
-  CantRunAsXPCService = 2
+  CantRunAsXPCService = 2,
+  CheckFailed = 3
 };
 
 int main(int argc, char *argv[]) {
@@ -646,7 +663,8 @@
   // If a user ran `clangd` in a terminal without redirecting anything,
   // it's somewhat likely they're confused about how to use clangd.
   // Show them the help overview, which explains.
-  if (llvm::outs().is_displayed() && llvm::errs().is_displayed())
+  if (llvm::outs().is_displayed() && llvm::errs().is_displayed() &&
+      !CheckFile.getNumOccurrences())
     llvm::errs() << Overview << "\n";
   // Use buffered stream to stderr (we still flush each log message). Unbuffered
   // stream can cause significant (non-deterministic) latency for the logger.
@@ -825,6 +843,15 @@
   // Shall we allow to customize the file limit?
   Opts.Rename.AllowCrossFile = CrossFileRename;
 
+  if (CheckFile.getNumOccurrences()) {
+    llvm::SmallString<256> Path;
+    llvm::sys::fs::real_path(CheckFile, Path, /*expand_tilde=*/true);
+    log("Entering check mode (no LSP server)");
+    return check(Path, TFS, Opts)
+               ? 0
+               : static_cast<int>(ErrorResultCode::CheckFailed);
+  }
+
   // Initialize and run ClangdLSPServer.
   // Change stdin to binary to not lose \r\n on windows.
   llvm::sys::ChangeStdinToBinary();
@@ -835,7 +862,7 @@
     TransportLayer = newXPCTransport();
 #else
     llvm::errs() << "This clangd binary wasn't built with XPC support.\n";
-    return (int)ErrorResultCode::CantRunAsXPCService;
+    return static_cast<int>(ErrorResultCode::CantRunAsXPCService);
 #endif
   } else {
     log("Starting LSP over stdin/stdout");
diff --git a/clang-tools-extra/docs/clang-tidy/checks/android-comparison-in-temp-failure-retry.rst b/clang-tools-extra/docs/clang-tidy/checks/android-comparison-in-temp-failure-retry.rst
--- a/clang-tools-extra/docs/clang-tidy/checks/android-comparison-in-temp-failure-retry.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/android-comparison-in-temp-failure-retry.rst
@@ -34,3 +34,10 @@
   while (TEMP_FAILURE_RETRY(read(STDIN_FILENO, cs, sizeof(cs))) != 0) {
     // Do something with cs.
   }
+
+Options
+-------
+
+.. option:: RetryMacros
+
+   A comma-separated list of the names of retry macros to be checked.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/android-comparison-in-temp-failure-retry-custom-macro.c b/clang-tools-extra/test/clang-tidy/checkers/android-comparison-in-temp-failure-retry-custom-macro.c
new file mode 100644
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/android-comparison-in-temp-failure-retry-custom-macro.c
@@ -0,0 +1,46 @@
+// RUN: %check_clang_tidy %s android-comparison-in-temp-failure-retry %t -- -config="{CheckOptions: [{key: android-comparison-in-temp-failure-retry.RetryMacros, value: 'MY_TEMP_FAILURE_RETRY,MY_OTHER_TEMP_FAILURE_RETRY'}]}"
+
+#define MY_TEMP_FAILURE_RETRY(x)  \
+  ({                              \
+    typeof(x) __z;                \
+    do                            \
+      __z = (x);                  \
+    while (__z == -1);            \
+    __z;                          \
+  })
+
+#define MY_OTHER_TEMP_FAILURE_RETRY(x) \
+  ({                                   \
+    typeof(x) __z;                     \
+    do                                 \
+      __z = (x);                       \
+    while (__z == -1);                 \
+    __z;                               \
+  })
+
+int foo();
+int bar(int a);
+
+void with_custom_macro() {
+  MY_TEMP_FAILURE_RETRY(foo());
+  MY_TEMP_FAILURE_RETRY(foo() == 1);
+  // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: top-level comparison in MY_TEMP_FAILURE_RETRY
+  MY_TEMP_FAILURE_RETRY((foo()));
+  MY_TEMP_FAILURE_RETRY((int)(foo() == 1));
+  // CHECK-MESSAGES: :[[@LINE-1]]:37: warning: top-level comparison in MY_TEMP_FAILURE_RETRY
+  MY_TEMP_FAILURE_RETRY((bar(foo() == 1)));
+  MY_TEMP_FAILURE_RETRY((int)((bar(foo() == 1)) == 1));
+  // CHECK-MESSAGES: :[[@LINE-1]]:49: warning: top-level comparison in MY_TEMP_FAILURE_RETRY
+}
+
+void with_other_custom_macro() {
+  MY_OTHER_TEMP_FAILURE_RETRY(foo());
+  MY_OTHER_TEMP_FAILURE_RETRY(foo() == 1);
+  // CHECK-MESSAGES: :[[@LINE-1]]:37: warning: top-level comparison in MY_OTHER_TEMP_FAILURE_RETRY
+  MY_OTHER_TEMP_FAILURE_RETRY((foo()));
+  MY_OTHER_TEMP_FAILURE_RETRY((int)(foo() == 1));
+  // CHECK-MESSAGES: :[[@LINE-1]]:43: warning: top-level comparison in MY_OTHER_TEMP_FAILURE_RETRY
+  MY_OTHER_TEMP_FAILURE_RETRY((bar(foo() == 1)));
+  MY_OTHER_TEMP_FAILURE_RETRY((int)((bar(foo() == 1)) == 1));
+  // CHECK-MESSAGES: :[[@LINE-1]]:55: warning: top-level comparison in MY_OTHER_TEMP_FAILURE_RETRY
+}
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -84,7 +84,6 @@
     set(RUNTIMES_${target}_CMAKE_SYSTEM_NAME Windows CACHE STRING "")
     set(RUNTIMES_${target}_CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "")
     set(RUNTIMES_${target}_LIBCXX_ABI_VERSION 2 CACHE STRING "")
-    set(RUNTIMES_${target}_LIBCXX_HAS_WIN32_THREAD_API ON CACHE BOOL "")
     set(RUNTIMES_${target}_LIBCXX_ENABLE_EXPERIMENTAL_LIBRARY OFF CACHE BOOL "")
     set(RUNTIMES_${target}_LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "")
     set(RUNTIMES_${target}_LIBCXX_ENABLE_ABI_LINKER_SCRIPT OFF CACHE BOOL "")
diff --git a/clang/include/clang/Tooling/Syntax/Nodes.h b/clang/include/clang/Tooling/Syntax/Nodes.h
--- a/clang/include/clang/Tooling/Syntax/Nodes.h
+++ b/clang/include/clang/Tooling/Syntax/Nodes.h
@@ -99,10 +99,14 @@
   ParametersAndQualifiers,
   MemberPointer,
   UnqualifiedId,
+
+  // Lists
+  DeclaratorList,
   ParameterDeclarationList,
   CallArguments,
-  // Nested Name Specifiers.
   NestedNameSpecifier,
+
+  // Name Specifiers.
   GlobalNameSpecifier,
   DecltypeNameSpecifier,
   IdentifierNameSpecifier,
@@ -179,6 +183,7 @@
   Member,
   Callee,
   Arguments,
+  Declarators
 };
 /// For debugging purposes.
 raw_ostream &operator<<(raw_ostream &OS, NodeRole R);
@@ -823,6 +828,17 @@
   }
 };
 
+class DeclaratorList final : public List {
+public:
+  DeclaratorList() : List(NodeKind::DeclaratorList) {}
+  static bool classof(const Node *N) {
+    return N->getKind() == NodeKind::DeclaratorList;
+  }
+  std::vector<SimpleDeclarator *> getDeclarators();
+  std::vector<List::ElementAndDelimiter<syntax::SimpleDeclarator>>
+  getDeclaratorsAndCommas();
+};
+
 /// Groups multiple declarators (e.g. variables, typedefs, etc.) together. All
 /// grouped declarators share the same declaration specifiers (e.g. 'int' or
 /// 'typedef').
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -2287,6 +2287,10 @@
   if (isa<ParmVarDecl>(this))
     return false;
 
+  // The values of weak variables are never usable in constant expressions.
+  if (isWeak())
+    return false;
+
   // In C++11, any variable of reference type can be used in a constant
   // expression if it is initialized by a constant expression.
   if (Lang.CPlusPlus11 && getType()->isReferenceType())
@@ -2414,10 +2418,6 @@
 }
 
 bool VarDecl::checkInitIsICE() const {
-  // Initializers of weak variables are never ICEs.
-  if (isWeak())
-    return false;
-
   EvaluatedStmt *Eval = ensureEvaluatedStmt();
   if (Eval->CheckedICE)
     // We have already checked whether this subexpression is an
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -2439,7 +2439,8 @@
     return false;
   }
 
-  if (St & APFloat::opStatus::opInvalidOp) {
+  if ((St & APFloat::opStatus::opInvalidOp) &&
+      FPO.getFPExceptionMode() != LangOptions::FPE_Ignore) {
     // There is no usefully definable result.
     Info.FFDiag(E);
     return false;
@@ -14816,7 +14817,7 @@
   const VarDecl *VD;
   // Look for a declaration of this variable that has an initializer, and
   // check whether it is an ICE.
-  if (Dcl->getAnyInitializer(VD) && VD->checkInitIsICE())
+  if (Dcl->getAnyInitializer(VD) && !VD->isWeak() && VD->checkInitIsICE())
     return NoDiag();
   else
     return ICEDiag(IK_NotICE, cast<DeclRefExpr>(E)->getLocation());
diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h
--- a/clang/lib/Basic/Targets/OSTargets.h
+++ b/clang/lib/Basic/Targets/OSTargets.h
@@ -154,7 +154,8 @@
       MinVersion = llvm::VersionTuple(5U);
       break;
     default:
-      llvm_unreachable("Unexpected OS");
+      // Conservatively return 8 bytes if OS is unknown.
+      return 64;
     }
 
     unsigned Major, Minor, Micro;
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1214,6 +1214,9 @@
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
+  if (TM)
+    TM->registerPassBuilderCallbacks(PB, CodeGenOpts.DebugPassManager);
+
   ModulePassManager MPM(CodeGenOpts.DebugPassManager);
 
   if (!CodeGenOpts.DisableLLVMPasses) {
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1794,11 +1794,6 @@
                            llvm::utostr(CodeGenOpts.SSPBufferSize));
     FuncAttrs.addAttribute("no-signed-zeros-fp-math",
                            llvm::toStringRef(LangOpts.NoSignedZero));
-    if (getLangOpts().OpenCL) {
-      FuncAttrs.addAttribute(
-          "correctly-rounded-divide-sqrt-fp-math",
-          llvm::toStringRef(CodeGenOpts.CorrectlyRoundedDivSqrt));
-    }
 
     // TODO: Reciprocal estimate codegen options should apply to instructions?
     const std::vector<std::string> &Recips = CodeGenOpts.Reciprocals;
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -162,6 +162,9 @@
   if (Args.hasArg(options::OPT_pthreads, options::OPT_pthread))
     CmdArgs.push_back("-lpthreads");
 
+  if (D.CCCIsCXX())
+    CmdArgs.push_back("-lm");
+
   CmdArgs.push_back("-lc");
 }
 
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -11112,7 +11112,7 @@
   // might be foobar, including it failing to be a constant expression.
   // TODO Handle more ways the lookup or result can be invalid.
   if (!VD->isStaticDataMember() || !VD->isConstexpr() || !VD->hasInit() ||
-      !VD->checkInitIsICE())
+      VD->isWeak() || !VD->checkInitIsICE())
     return UnsupportedSTLError(USS_InvalidMember, MemName, VD);
 
   // Attempt to evaluate the var decl as a constant expression and extract
diff --git a/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp b/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp
--- a/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp
+++ b/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp
@@ -233,7 +233,12 @@
 bool IncludeCategoryManager::isMainHeader(StringRef IncludeName) const {
   if (!IncludeName.startswith("\""))
     return false;
-  StringRef HeaderStem = matchingStem(IncludeName.drop_front(1).drop_back(1));
+
+  // Not matchingStem: implementation files may have compound extensions but
+  // headers may not.
+  StringRef HeaderStem =
+      llvm::sys::path::stem(IncludeName.drop_front(1).drop_back(
+          1) /* remove the surrounding "" or <> */);
   if (FileStem.startswith(HeaderStem) ||
       FileStem.startswith_lower(HeaderStem)) {
     llvm::Regex MainIncludeRegex(HeaderStem.str() + Style.IncludeIsMainRegex,
diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp
--- a/clang/lib/Tooling/Syntax/BuildTree.cpp
+++ b/clang/lib/Tooling/Syntax/BuildTree.cpp
@@ -397,6 +397,17 @@
     Mapping.add(From, New);
   }
 
+  /// Populate children for \p New list, assuming it covers tokens from a
+  /// subrange of \p SuperRange.
+  void foldList(ArrayRef<syntax::Token> SuperRange, syntax::List *New,
+                ASTPtr From) {
+    assert(New);
+    auto ListRange = Pending.shrinkToFitList(SuperRange);
+    Pending.foldChildren(Arena, ListRange, New);
+    if (From)
+      Mapping.add(From, New);
+  }
+
   /// Notifies that we should not consume trailing semicolon when computing
   /// token range of \p D.
   void noticeDeclWithoutSemicolon(Decl *D);
@@ -579,6 +590,35 @@
     It->second->setRole(Role);
   }
 
+  /// Shrink \p Range to a subrange that only contains tokens of a list.
+  /// List elements and delimiters should already have correct roles.
+ ArrayRef shrinkToFitList(ArrayRef Range) { + auto BeginChildren = Trees.lower_bound(Range.begin()); + assert((BeginChildren == Trees.end() || + BeginChildren->first == Range.begin()) && + "Range crosses boundaries of existing subtrees"); + + auto EndChildren = Trees.lower_bound(Range.end()); + assert( + (EndChildren == Trees.end() || EndChildren->first == Range.end()) && + "Range crosses boundaries of existing subtrees"); + + auto BelongsToList = [](decltype(Trees)::value_type KV) { + auto Role = KV.second->getRole(); + return Role == syntax::NodeRole::ListElement || + Role == syntax::NodeRole::ListDelimiter; + }; + + auto BeginListChildren = + std::find_if(BeginChildren, EndChildren, BelongsToList); + + auto EndListChildren = + std::find_if_not(BeginListChildren, EndChildren, BelongsToList); + + return ArrayRef(BeginListChildren->first, + EndListChildren->first); + } + /// Add \p Node to the forest and attach child nodes based on \p Tokens. void foldChildren(const syntax::Arena &A, ArrayRef Tokens, syntax::Tree *Node) { @@ -1513,14 +1553,31 @@ // There doesn't have to be a declarator (e.g. `void foo(int)` only has // declaration, but no declarator). 
- if (Range.getBegin().isValid()) { - auto *N = new (allocator()) syntax::SimpleDeclarator; - Builder.foldNode(Builder.getRange(Range), N, nullptr); - Builder.markChild(N, syntax::NodeRole::Declarator); + if (!Range.getBegin().isValid()) { + Builder.markChild(new (allocator()) syntax::DeclaratorList, + syntax::NodeRole::Declarators); + Builder.foldNode(Builder.getDeclarationRange(D), + new (allocator()) syntax::SimpleDeclaration, D); + return true; } - if (Builder.isResponsibleForCreatingDeclaration(D)) { - Builder.foldNode(Builder.getDeclarationRange(D), + auto *N = new (allocator()) syntax::SimpleDeclarator; + Builder.foldNode(Builder.getRange(Range), N, nullptr); + Builder.markChild(N, syntax::NodeRole::ListElement); + + if (!Builder.isResponsibleForCreatingDeclaration(D)) { + // If this is not the last declarator in the declaration we expect a + // delimiter after it. + const auto *DelimiterToken = std::next(Builder.findToken(Range.getEnd())); + if (DelimiterToken->kind() == clang::tok::TokenKind::comma) + Builder.markChildToken(DelimiterToken, syntax::NodeRole::ListDelimiter); + } else { + auto *DL = new (allocator()) syntax::DeclaratorList; + auto DeclarationRange = Builder.getDeclarationRange(D); + Builder.foldList(DeclarationRange, DL, nullptr); + + Builder.markChild(DL, syntax::NodeRole::Declarators); + Builder.foldNode(DeclarationRange, new (allocator()) syntax::SimpleDeclaration, D); } return true; diff --git a/clang/lib/Tooling/Syntax/Nodes.cpp b/clang/lib/Tooling/Syntax/Nodes.cpp --- a/clang/lib/Tooling/Syntax/Nodes.cpp +++ b/clang/lib/Tooling/Syntax/Nodes.cpp @@ -136,6 +136,8 @@ return OS << "CallArguments"; case NodeKind::ParameterDeclarationList: return OS << "ParameterDeclarationList"; + case NodeKind::DeclaratorList: + return OS << "DeclaratorList"; } llvm_unreachable("unknown node kind"); } @@ -218,6 +220,8 @@ return OS << "Callee"; case syntax::NodeRole::Arguments: return OS << "Arguments"; + case syntax::NodeRole::Declarators: + return OS << 
"Declarators"; } llvm_unreachable("invalid role"); } @@ -291,6 +295,29 @@ return Children; } +std::vector +syntax::DeclaratorList::getDeclarators() { + auto DeclaratorsAsNodes = getElementsAsNodes(); + std::vector Children; + for (const auto &DeclaratorAsNode : DeclaratorsAsNodes) { + Children.push_back(llvm::cast(DeclaratorAsNode)); + } + return Children; +} + +std::vector> +syntax::DeclaratorList::getDeclaratorsAndCommas() { + auto DeclaratorsAsNodesAndCommas = getElementsAsNodesAndDelimiters(); + std::vector> + Children; + for (const auto &DeclaratorAsNodeAndComma : DeclaratorsAsNodesAndCommas) { + Children.push_back( + {llvm::cast(DeclaratorAsNodeAndComma.element), + DeclaratorAsNodeAndComma.delimiter}); + } + return Children; +} + syntax::Expression *syntax::MemberExpression::getObject() { return cast_or_null(findChild(syntax::NodeRole::Object)); } diff --git a/clang/lib/Tooling/Syntax/Synthesis.cpp b/clang/lib/Tooling/Syntax/Synthesis.cpp --- a/clang/lib/Tooling/Syntax/Synthesis.cpp +++ b/clang/lib/Tooling/Syntax/Synthesis.cpp @@ -183,6 +183,8 @@ return new (A.getAllocator()) syntax::CallArguments; case syntax::NodeKind::ParameterDeclarationList: return new (A.getAllocator()) syntax::ParameterDeclarationList; + case syntax::NodeKind::DeclaratorList: + return new (A.getAllocator()) syntax::DeclaratorList; } llvm_unreachable("unknown node kind"); } diff --git a/clang/test/CodeGen/builtin-nan-exception.c b/clang/test/CodeGen/builtin-nan-exception.c --- a/clang/test/CodeGen/builtin-nan-exception.c +++ b/clang/test/CodeGen/builtin-nan-exception.c @@ -5,18 +5,32 @@ // Run a variety of targets to ensure there's no target-based difference. -// The builtin always produces a 64-bit (double). // An SNaN with no payload is formed by setting the bit after the // the quiet bit (MSB of the significand). 
// CHECK: float 0x7FF8000000000000, float 0x7FF4000000000000 -// CHECK: double 0x7FF8000000000000, double 0x7FF4000000000000 float f[] = { + __builtin_nanf(""), + __builtin_nansf(""), +}; + + +// Doubles are created and converted to floats. +// Converting (truncating) to float quiets the NaN (sets the MSB +// of the significand) and raises the APFloat invalidOp exception +// but that should not cause a compilation error in the default +// (ignore FP exceptions) mode. + +// CHECK: float 0x7FF8000000000000, float 0x7FFC000000000000 + +float converted_to_float[] = { __builtin_nan(""), __builtin_nans(""), }; +// CHECK: double 0x7FF8000000000000, double 0x7FF4000000000000 + double d[] = { __builtin_nan(""), __builtin_nans(""), diff --git a/clang/test/CodeGen/builtin-nan-legacy.c b/clang/test/CodeGen/builtin-nan-legacy.c --- a/clang/test/CodeGen/builtin-nan-legacy.c +++ b/clang/test/CodeGen/builtin-nan-legacy.c @@ -1,7 +1,15 @@ // RUN: %clang -target mipsel-unknown-linux -mnan=legacy -emit-llvm -S %s -o - | FileCheck %s -// CHECK: float 0x7FF4000000000000, float 0x7FF8000000000000 +// CHECK: float 0x7FFC000000000000, float 0x7FF8000000000000 // CHECK: double 0x7FF4000000000000, double 0x7FF8000000000000 +// The first line shows an unintended consequence. +// __builtin_nan() creates a legacy QNAN double with an empty payload +// (the first bit of the significand is clear to indicate quiet, so +// the second bit of the payload is set to maintain NAN-ness). +// The value is then truncated, but llvm::APFloat does not know about +// the inverted quiet bit, so it sets the first bit on conversion +// to indicate 'quiet' independently of the setting in clang. 
+ float f[] = { __builtin_nan(""), __builtin_nans(""), diff --git a/clang/test/CodeGen/mips-unsupported-nan.c b/clang/test/CodeGen/mips-unsupported-nan.c --- a/clang/test/CodeGen/mips-unsupported-nan.c +++ b/clang/test/CodeGen/mips-unsupported-nan.c @@ -39,7 +39,21 @@ // CHECK-MIPS64: warning: ignoring '-mnan=2008' option because the 'mips64' architecture does not support it // CHECK-MIPS64R6: warning: ignoring '-mnan=legacy' option because the 'mips64r6' architecture does not support it -// CHECK-NANLEGACY: float 0x7FF4000000000000 +// This call creates a QNAN double with an empty payload. +// The quiet bit is inverted in legacy mode: it is clear to indicate QNAN, +// so the next highest bit is set to maintain NAN (not infinity). +// In regular (2008) mode, the quiet bit is set to indicate QNAN. + +// CHECK-NANLEGACY: double 0x7FF4000000000000 +// CHECK-NAN2008: double 0x7FF8000000000000 + +double d = __builtin_nan(""); + +// This call creates a QNAN double with an empty payload and then truncates. +// llvm::APFloat does not know about the inverted quiet bit, so it sets the +// quiet bit on conversion independently of the setting in clang. 
+ +// CHECK-NANLEGACY: float 0x7FFC000000000000 // CHECK-NAN2008: float 0x7FF8000000000000 float f = __builtin_nan(""); diff --git a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl --- a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl @@ -190,5 +190,5 @@ // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-implicitarg-num-bytes"="56" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2" // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-implicitarg-num-bytes"="56" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2,4" -// CHECK-DAG: attributes [[A_FUNCTION]] = {{.*}} "correctly-rounded-divide-sqrt-fp-math"="false" +// CHECK-DAG: attributes [[A_FUNCTION]] = {{.*}} // CHECK-DAG: attributes [[DEFAULT_KERNEL_ATTRS]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" diff --git a/clang/test/CodeGenOpenCL/fpmath.cl b/clang/test/CodeGenOpenCL/fpmath.cl --- a/clang/test/CodeGenOpenCL/fpmath.cl +++ b/clang/test/CodeGenOpenCL/fpmath.cl @@ -7,7 +7,6 @@ float spscalardiv(float a, float b) { // CHECK: @spscalardiv - // CHECK: #[[ATTR:[0-9]+]] // CHECK: fdiv{{.*}}, // NODIVOPT: !fpmath ![[MD:[0-9]+]] // DIVOPT-NOT: !fpmath ![[MD:[0-9]+]] @@ -16,7 +15,6 @@ float4 spvectordiv(float4 a, float4 b) { // CHECK: @spvectordiv - // CHECK: #[[ATTR2:[0-9]+]] // CHECK: fdiv{{.*}}, // NODIVOPT: !fpmath ![[MD]] // DIVOPT-NOT: !fpmath ![[MD]] @@ -38,18 +36,9 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable double dpscalardiv(double a, double b) { // CHECK: @dpscalardiv - // CHECK: #[[ATTR]] // CHECK-NOT: !fpmath return a / b; } #endif -// CHECK: attributes #[[ATTR]] = { -// NODIVOPT-SAME: "correctly-rounded-divide-sqrt-fp-math"="false" -// DIVOPT-SAME: 
"correctly-rounded-divide-sqrt-fp-math"="true" -// CHECK-SAME: } -// CHECK: attributes #[[ATTR2]] = { -// NODIVOPT-SAME: "correctly-rounded-divide-sqrt-fp-math"="false" -// DIVOPT-SAME: "correctly-rounded-divide-sqrt-fp-math"="true" -// CHECK-SAME: } // NODIVOPT: ![[MD]] = !{float 2.500000e+00} diff --git a/clang/test/Driver/aix-ld.c b/clang/test/Driver/aix-ld.c --- a/clang/test/Driver/aix-ld.c +++ b/clang/test/Driver/aix-ld.c @@ -20,6 +20,7 @@ // CHECK-LD32: "-L[[SYSROOT]]/usr/lib" // CHECK-LD32-NOT: "-lc++" // CHECK-LD32: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" +// CHECK-LD32-NOT: "-lm" // CHECK-LD32: "-lc" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. @@ -41,6 +42,7 @@ // CHECK-LD64: "-L[[SYSROOT]]/usr/lib" // CHECK-LD64-NOT: "-lc++" // CHECK-LD64: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc64.a" +// CHECK-LD64-NOT: "-lm" // CHECK-LD64: "-lc" // Check powerpc-ibm-aix7.1.0.0, 32-bit. Enable POSIX thread support. @@ -64,6 +66,7 @@ // CHECK-LD32-PTHREAD-NOT: "-lc++" // CHECK-LD32-PTHREAD: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" // CHECK-LD32-PTHREAD: "-lpthreads" +// CHECK-LD32-PTHREAD-NOT: "-lm" // CHECK-LD32-PTHREAD: "-lc" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. POSIX thread alias. @@ -87,6 +90,7 @@ // CHECK-LD64-PTHREAD-NOT: "-lc++" // CHECK-LD64-PTHREAD: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc64.a" // CHECK-LD64-PTHREAD: "-lpthreads" +// CHECK-LD64-PTHREAD-NOT: "-lm" // CHECK-LD64-PTHREAD: "-lc" // Check powerpc-ibm-aix7.1.0.0, 32-bit. Enable profiling. @@ -109,6 +113,7 @@ // CHECK-LD32-PROF: "-L[[SYSROOT]]/usr/lib" // CHECK-LD32-PROF-NOT: "-lc++" // CHECK-LD32-PROF: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" +// CHECK-LD32-PROF-NOT: "-lm" // CHECK-LD32-PROF: "-lc" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. Enable g-profiling. 
@@ -131,6 +136,7 @@ // CHECK-LD64-GPROF: "-L[[SYSROOT]]/usr/lib" // CHECK-LD64-GPROF-NOT: "-lc++" // CHECK-LD64-GPROF: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc64.a" +// CHECK-LD64-GPROF-NOT: "-lm" // CHECK-LD64-GPROF: "-lc" // Check powerpc-ibm-aix7.1.0.0, 32-bit. Static linking. @@ -153,6 +159,7 @@ // CHECK-LD32-STATIC: "-L[[SYSROOT]]/usr/lib" // CHECK-LD32-STATIC-NOT: "-lc++" // CHECK-LD32-STATIC: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" +// CHECK-LD32-STATIC-NOT: "-lm" // CHECK-LD32-STATIC: "-lc" // Check powerpc-ibm-aix7.1.0.0, 32-bit. Library search path. @@ -176,6 +183,7 @@ // CHECK-LD32-LIBP: "-L[[SYSROOT]]/usr/lib" // CHECK-LD32-LIBP-NOT: "-lc++" // CHECK-LD32-LIBP: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" +// CHECK-LD32-LIBP-NOT: "-lm" // CHECK-LD32-LIBP: "-lc" // Check powerpc-ibm-aix7.1.0.0, 32-bit. nostdlib. @@ -200,6 +208,7 @@ // CHECK-LD32-NO-STD-LIB-NOT: "-lc++" // CHECK-LD32-NO-STD-LIB-NOT: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" // CHECK-LD32-NO-STD-LIB-NOT: "-lpthreads" +// CHECK-LD32-NO-STD-LIB-NOT: "-lm" // CHECK-LD32-NO-STD-LIB-NOT: "-lc" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. nodefaultlibs. @@ -224,6 +233,7 @@ // CHECK-LD64-NO-DEFAULT-LIBS-NOT: "-lc++" // CHECK-LD64-NO-DEFAULT-LIBS-NOT: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc64.a" // CHECK-LD64-NO-DEFAULT-LIBS-NOT: "-lpthreads" +// CHECK-LD64-NO-DEFAULT-LIBS-NOT: "-lm" // CHECK-LD64-NO-DEFAULT-LIBS-NOT: "-lc" // Check powerpc-ibm-aix7.1.0.0, 32-bit. 'bcdtors' and argument order. 
@@ -247,6 +257,7 @@ // CHECK-LD32-CXX-ARG-ORDER-NOT: "-bcdtors:all:0:s" // CHECK-LD32-CXX-ARG-ORDER: "-lc++" // CHECK-LD32-CXX-ARG-ORDER: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" +// CHECK-LD32-CXX-ARG-ORDER: "-lm" // CHECK-LD32-CXX-ARG-ORDER: "-lc" // Check powerpc-ibm-aix7.1.0.0, 32-bit. lc++ and lc order. @@ -266,6 +277,7 @@ // CHECK-LD32-CXX-ARG-LCXX: "-L[[SYSROOT]]/usr/lib" // CHECK-LD32-CXX-ARG-LCXX: "-lc++" // CHECK-LD32-CXX-ARG-LCXX: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" +// CHECK-LD32-CXX-ARG-LCXX: "-lm" // CHECK-LD32-CXX-ARG-LCXX: "-lc" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. lc++ and lc order. @@ -285,6 +297,7 @@ // CHECK-LD64-CXX-ARG-LCXX: "-L[[SYSROOT]]/usr/lib" // CHECK-LD64-CXX-ARG-LCXX: "-lc++" // CHECK-LD64-CXX-ARG-LCXX: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc64.a" +// CHECK-LD64-CXX-ARG-LCXX: "-lm" // CHECK-LD64-CXX-ARG-LCXX: "-lc" // Check powerpc-ibm-aix7.1.0.0, 32-bit. -nodefaultlibs. @@ -305,6 +318,7 @@ // CHECK-LD32-NODEFLIB-LCXX: "-L[[SYSROOT]]/usr/lib" // CHECK-LD32-NODEFLIB-LCXX-NOT: "-lc++" // CHECK-LD32-NODEFLIB-LCXX-NOT: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" +// CHECK-LD32-NODEFLIB-LCXX-NOT: "-lm" // CHECK-LD32-NODEFLIB-LCXX-NOT: "-lc" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. -nodefaultlibs. @@ -325,6 +339,7 @@ // CHECK-LD64-NODEFLIB-LCXX: "-L[[SYSROOT]]/usr/lib" // CHECK-LD64-NODEFLIB-LCXX-NOT: "-lc++" // CHECK-LD64-NODEFLIB-LCXX-NOT: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc64.a" +// CHECK-LD64-NODEFLIB-LCXX-NOT: "-lm" // CHECK-LD64-NODEFLIB-LCXX-NOT: "-lc" // Check powerpc-ibm-aix7.1.0.0, 32-bit. -nostdlib. 
@@ -345,6 +360,7 @@ // CHECK-LD32-NOSTDLIB-LCXX: "-L[[SYSROOT]]/usr/lib" // CHECK-LD32-NOSTDLIB-LCXX-NOT: "-lc++" // CHECK-LD32-NOSTDLIB-LCXX-NOT: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" +// CHECK-LD32-NOSTDLIB-LCXX-NOT: "-lm" // CHECK-LD32-NOSTDLIB-LCXX-NOT: "-lc" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. -nostdlib. @@ -365,6 +381,7 @@ // CHECK-LD64-NOSTDLIB-LCXX: "-L[[SYSROOT]]/usr/lib" // CHECK-LD64-NOSTDLIB-LCXX-NOT: "-lc++" // CHECK-LD64-NOSTDLIB-LCXX-NOT: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc64.a" +// CHECK-LD64-NOSTDLIB-LCXX-NOT: "-lm" // CHECK-LD64-NOSTDLIB-LCXX-NOT: "-lc" // Check powerpc-ibm-aix7.1.0.0, 32-bit. -nostdlib++. @@ -386,6 +403,7 @@ // CHECK-LD32-NOSTDLIBXX-LCXX: "-L[[SYSROOT]]/usr/lib" // CHECK-LD32-NOSTDLIBXX-LCXX-NOT: "-lc++" // CHECK-LD32-NOSTDLIBXX-LCXX: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" +// CHECK-LD32-NOSTDLIBXX-LCXX: "-lm" // CHECK-LD32-NOSTDLIBXX-LCXX: "-lc" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. -nostdlib++. @@ -406,6 +424,7 @@ // CHECK-LD64-NOSTDLIBXX-LCXX: "-L[[SYSROOT]]/usr/lib" // CHECK-LD64-NOSTDLIBXX-LCXX-NOT: "-lc++" // CHECK-LD64-NOSTDLIBXX-LCXX: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc64.a" +// CHECK-LD64-NOSTDLIBXX-LCXX: "-lm" // CHECK-LD64-NOSTDLIBXX-LCXX: "-lc" // Check powerpc64-ibm-aix7.1.0.0, 32-bit. -nostartfiles. 
@@ -424,8 +443,9 @@ // CHECK-LD32-NOSTARTFILES-LCXX-NOT: "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0.o" // CHECK-LD32-NOSTARTFILES-LCXX-NOT: "[[SYSROOT]]/usr/lib{{/|\\\\}}crti.o" // CHECK-LD32-NOSTARTFILES-LCXX: "-L[[SYSROOT]]/usr/lib" -// CHECK-LD32-NOSTARTFILES-LCXX "-lc++" +// CHECK-LD32-NOSTARTFILES-LCXX: "-lc++" // CHECK-LD32-NOSTARTFILES-LCXX: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" +// CHECK-LD32-NOSTARTFILES-LCXX: "-lm" // CHECK-LD32-NOSTARTFILES-LCXX: "-lc" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. -nostartfiles. @@ -446,6 +466,7 @@ // CHECK-LD64-NOSTARTFILES-LCXX: "-L[[SYSROOT]]/usr/lib" // CHECK-LD64-NOSTARTFILES-LCXX: "-lc++" // CHECK-LD64-NOSTARTFILES-LCXX: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc64.a" +// CHECK-LD64-NOSTARTFILES-LCXX: "-lm" // CHECK-LD64-NOSTARTFILES-LCXX: "-lc" // Check powerpc-ibm-aix7.1.0.0, 32-bit. -stdlib=libstdc++ invokes fatal error. @@ -483,6 +504,7 @@ // CHECK-LD32-SHARED: "-L[[SYSROOT]]/usr/lib" // CHECK-LD32-SHARED: "-lc++" // CHECK-LD32-SHARED: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc.a" +// CHECK-LD32-SHARED: "-lm" // CHECK-LD32-SHARED: "-lc" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. -shared. 
@@ -505,4 +527,5 @@ // CHECK-LD64-SHARED: "-L[[SYSROOT]]/usr/lib" // CHECK-LD64-SHARED: "-lc++" // CHECK-LD64-SHARED: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}aix{{/|\\\\}}libclang_rt.builtins-powerpc64.a" +// CHECK-LD64-SHARED: "-lm" // CHECK-LD64-SHARED: "-lc" diff --git a/clang/test/Headers/arm-neon-header.c b/clang/test/Headers/arm-neon-header.c --- a/clang/test/Headers/arm-neon-header.c +++ b/clang/test/Headers/arm-neon-header.c @@ -22,5 +22,6 @@ // RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64-none-eabi -march=armv8.2-a+fp16fml+crypto+dotprod -std=c11 -xc -flax-vector-conversions=none %s // RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64_be-none-eabi -march=armv8.2-a+fp16fml+crypto+dotprod -std=c11 -xc -flax-vector-conversions=none %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=arm64-linux-gnu -arch +neon -std=c11 -xc -flax-vector-conversions=none %s #include diff --git a/clang/test/SemaCXX/cxx20-constinit.cpp b/clang/test/SemaCXX/cxx20-constinit.cpp new file mode 100644 --- /dev/null +++ b/clang/test/SemaCXX/cxx20-constinit.cpp @@ -0,0 +1,4 @@ +// RUN: %clang_cc1 %s -std=c++20 -verify +// expected-no-diagnostics + +constinit int a __attribute__((weak)) = 0; diff --git a/clang/test/SemaCXX/warn-overaligned-type-thrown.cpp b/clang/test/SemaCXX/warn-overaligned-type-thrown.cpp --- a/clang/test/SemaCXX/warn-overaligned-type-thrown.cpp +++ b/clang/test/SemaCXX/warn-overaligned-type-thrown.cpp @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple arm64-apple-tvos10 -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions -DUNDERALIGNED %s // RUN: %clang_cc1 -triple arm64-apple-watchos4 -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions -DUNDERALIGNED %s // RUN: %clang_cc1 -triple arm-linux-gnueabi -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions -DUNDERALIGNED %s +// RUN: %clang_cc1 -triple thumbv7em-apple-unknown-macho -verify -fsyntax-only -std=c++11 -fcxx-exceptions 
-fexceptions -DUNDERALIGNED %s // RUN: %clang_cc1 -triple x86_64-apple-macosx10.14 -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions %s // RUN: %clang_cc1 -triple arm64-apple-ios12 -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions %s // RUN: %clang_cc1 -triple arm64-apple-tvos12 -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions %s diff --git a/clang/unittests/Format/SortIncludesTest.cpp b/clang/unittests/Format/SortIncludesTest.cpp --- a/clang/unittests/Format/SortIncludesTest.cpp +++ b/clang/unittests/Format/SortIncludesTest.cpp @@ -151,6 +151,16 @@ EXPECT_TRUE(sortIncludes(FmtStyle, Code, GetCodeRange(Code), "a.cc").empty()); } +TEST_F(SortIncludesTest, NoMainFileHeader) { + std::string Code = "#include \n" + "\n" + "#include \"a/extra_action.proto.h\"\n"; + FmtStyle = getGoogleStyle(FormatStyle::LK_Cpp); + EXPECT_TRUE( + sortIncludes(FmtStyle, Code, GetCodeRange(Code), "a/extra_action.cc") + .empty()); +} + TEST_F(SortIncludesTest, SortedIncludesInMultipleBlocksAreMerged) { Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge; EXPECT_EQ("#include \"a.h\"\n" diff --git a/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp b/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp --- a/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp +++ b/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp @@ -92,21 +92,23 @@ TranslationUnit Detached |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | |-'main' -| | `-ParametersAndQualifiers -| | |-'(' OpenParen -| | `-')' CloseParen +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | |-'main' +| | `-ParametersAndQualifiers +| | |-'(' OpenParen +| | `-')' CloseParen | `-CompoundStatement | |-'{' OpenParen | `-'}' CloseParen `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'foo' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'foo' + | 
`-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement |-'{' OpenParen `-'}' CloseParen @@ -123,16 +125,18 @@ TranslationUnit Detached |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | `-'a' +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | `-'a' | `-';' `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'b' - | |-'=' - | `-IntegerLiteralExpression - | `-'42' LiteralToken + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'b' + | |-'=' + | `-IntegerLiteralExpression + | `-'42' LiteralToken `-';' )txt")); } @@ -146,21 +150,24 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'foo' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | |-SimpleDeclaration ListElement - | | | |-'int' - | | | `-SimpleDeclarator Declarator - | | | `-'a' - | | |-',' ListDelimiter - | | `-SimpleDeclaration ListElement - | | |-'int' - | | `-SimpleDeclarator Declarator - | | `-'b' - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'foo' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | |-'int' + | | | `-DeclaratorList Declarators + | | | `-SimpleDeclarator ListElement + | | | `-'a' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'int' + | | `-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | `-'b' + | `-')' CloseParen `-CompoundStatement |-'{' OpenParen `-'}' CloseParen @@ -178,8 +185,9 @@ `-SimpleDeclaration |-'in\ t' - |-SimpleDeclarator Declarator - | `-'a' + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | `-'a' `-';' )txt")); } @@ -264,8 +272,9 @@ |-'(' |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | `-'x' +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | `-'x' 
| `-':' |-IdExpression | `-UnqualifiedId UnqualifiedId @@ -287,11 +296,12 @@ DeclarationStatement Statement |-SimpleDeclaration | |-'int' -| `-SimpleDeclarator Declarator -| |-'a' -| |-'=' -| `-IntegerLiteralExpression -| `-'10' LiteralToken +| `-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'a' +| |-'=' +| `-IntegerLiteralExpression +| `-'10' LiteralToken `-';' )txt"})); } @@ -391,11 +401,12 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'test' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'test' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement |-'{' OpenParen |-ExpressionStatement Statement @@ -642,8 +653,9 @@ | | `-'n' | `-'::' ListDelimiter |-'S' -`-SimpleDeclarator Declarator - `-'s1' +`-DeclaratorList Declarators + `-SimpleDeclarator ListElement + `-'s1' )txt", R"txt( SimpleDeclaration @@ -652,8 +664,9 @@ | | `-'n' | `-'::' ListDelimiter |-'S' -`-SimpleDeclarator Declarator - `-'s2' +`-DeclaratorList Declarators + `-SimpleDeclarator ListElement + `-'s2' )txt"})); } @@ -684,8 +697,9 @@ | | `-'>' | `-'::' ListDelimiter |-'S' -`-SimpleDeclarator Declarator - `-'s1' +`-DeclaratorList Declarators + `-SimpleDeclarator ListElement + `-'s1' )txt", R"txt( SimpleDeclaration @@ -698,8 +712,9 @@ | | `-'>' | `-'::' ListDelimiter |-'S' -`-SimpleDeclarator Declarator - `-'s2' +`-DeclaratorList Declarators + `-SimpleDeclarator ListElement + `-'s2' )txt"})); } @@ -1363,11 +1378,12 @@ "TranslationUnit Detached\n" "`-SimpleDeclaration\n" " |-'void'\n" - " |-SimpleDeclarator Declarator\n" - " | |-'test'\n" - " | `-ParametersAndQualifiers\n" - " | |-'(' OpenParen\n" - " | `-')' CloseParen\n" + " |-DeclaratorList Declarators\n" + " | `-SimpleDeclarator ListElement\n" + " | |-'test'\n" + " | `-ParametersAndQualifiers\n" + " | |-'(' OpenParen\n" + " | `-')' CloseParen\n" " 
`-CompoundStatement\n" " |-'{' OpenParen\n" " |-ExpressionStatement Statement\n" @@ -2875,21 +2891,23 @@ TranslationUnit Detached |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | |-'*' -| | `-'a' -| |-',' -| |-SimpleDeclarator Declarator -| | `-'b' +| |-DeclaratorList Declarators +| | |-SimpleDeclarator ListElement +| | | |-'*' +| | | `-'a' +| | |-',' ListDelimiter +| | `-SimpleDeclarator ListElement +| | `-'b' | `-';' `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'*' - | `-'c' - |-',' - |-SimpleDeclarator Declarator - | `-'d' + |-DeclaratorList Declarators + | |-SimpleDeclarator ListElement + | | |-'*' + | | `-'c' + | |-',' ListDelimiter + | `-SimpleDeclarator ListElement + | `-'d' `-';' )txt")); } @@ -2904,12 +2922,13 @@ `-SimpleDeclaration |-'typedef' |-'int' - |-SimpleDeclarator Declarator - | |-'*' - | `-'a' - |-',' - |-SimpleDeclarator Declarator - | `-'b' + |-DeclaratorList Declarators + | |-SimpleDeclarator ListElement + | | |-'*' + | | `-'a' + | |-',' ListDelimiter + | `-SimpleDeclarator ListElement + | `-'b' `-';' )txt")); } @@ -2926,33 +2945,36 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'foo' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'foo' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement |-'{' OpenParen |-DeclarationStatement Statement | |-SimpleDeclaration | | |-'int' - | | |-SimpleDeclarator Declarator - | | | |-'*' - | | | `-'a' - | | |-',' - | | `-SimpleDeclarator Declarator - | | `-'b' + | | `-DeclaratorList Declarators + | | |-SimpleDeclarator ListElement + | | | |-'*' + | | | `-'a' + | | |-',' ListDelimiter + | | `-SimpleDeclarator ListElement + | | `-'b' | `-';' |-DeclarationStatement Statement | |-SimpleDeclaration | | |-'typedef' | | |-'int' - | | |-SimpleDeclarator Declarator - | | | |-'*' - | | | `-'ta' - | | 
|-',' - | | `-SimpleDeclarator Declarator - | | `-'tb' + | | `-DeclaratorList Declarators + | | |-SimpleDeclarator ListElement + | | | |-'*' + | | | `-'ta' + | | |-',' ListDelimiter + | | `-SimpleDeclarator ListElement + | | `-'tb' | `-';' `-'}' CloseParen )txt")); @@ -2979,8 +3001,9 @@ | |-'*' | `-')' |-')' - |-SimpleDeclarator Declarator - | `-'size_t' + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | `-'size_t' `-';' )txt")); } @@ -3174,9 +3197,10 @@ SimpleDeclaration |-'struct' |-'Y' -|-SimpleDeclarator Declarator -| |-'*' -| `-'y1' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'*' +| `-'y1' `-';' )txt"})); } @@ -3202,9 +3226,10 @@ |-'Y' |-'{' |-'}' -|-SimpleDeclarator Declarator -| |-'*' -| `-'y2' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'*' +| `-'y2' `-';' )txt", R"txt( @@ -3212,9 +3237,10 @@ |-'struct' |-'{' |-'}' -|-SimpleDeclarator Declarator -| |-'*' -| `-'a1' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'*' +| `-'a1' `-';' )txt"})); } @@ -3233,11 +3259,12 @@ SimpleDeclaration |-'static' |-'void' -|-SimpleDeclarator Declarator -| |-'f' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| `-')' CloseParen +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'f' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| `-')' CloseParen `-CompoundStatement |-'{' OpenParen `-'}' CloseParen @@ -3258,15 +3285,16 @@ {R"txt( SimpleDeclaration |-'void' -|-SimpleDeclarator Declarator -| |-NestedNameSpecifier -| | |-IdentifierNameSpecifier ListElement -| | | `-'S' -| | `-'::' ListDelimiter -| |-'f' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| `-')' CloseParen +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-NestedNameSpecifier +| | |-IdentifierNameSpecifier ListElement +| | | `-'S' +| | `-'::' ListDelimiter +| |-'f' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| `-')' CloseParen `-CompoundStatement |-'{' OpenParen `-'}' CloseParen @@ 
-3285,12 +3313,13 @@ )cpp", {R"txt( SimpleDeclaration -|-SimpleDeclarator Declarator -| |-'operator' -| |-'int' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| `-')' CloseParen +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'operator' +| |-'int' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| `-')' CloseParen `-';' )txt"})); } @@ -3307,16 +3336,17 @@ TranslationUnit Detached `-SimpleDeclaration |-'unsigned' - |-SimpleDeclarator Declarator - | |-'operator' - | |-'""' - | |-'_c' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | `-SimpleDeclaration ListElement - | | `-'char' - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'operator' + | |-'""' + | |-'_c' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | `-SimpleDeclaration ListElement + | | `-'char' + | `-')' CloseParen `-';' )txt")); } @@ -3341,13 +3371,14 @@ |-'>' `-SimpleDeclaration |-'unsigned' - |-SimpleDeclarator Declarator - | |-'operator' - | |-'""' - | |-'_t' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'operator' + | |-'""' + | |-'_t' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-';' )txt")); } @@ -3365,19 +3396,21 @@ {R"txt( SimpleDeclaration |-'X' -|-SimpleDeclarator Declarator -| |-'&' -| |-'operator' -| |-'=' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-ParameterDeclarationList Parameters -| | `-SimpleDeclaration ListElement -| | |-'const' -| | |-'X' -| | `-SimpleDeclarator Declarator -| | `-'&' -| `-')' CloseParen +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'&' +| |-'operator' +| |-'=' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-ParameterDeclarationList Parameters +| | `-SimpleDeclaration ListElement +| | |-'const' +| | |-'X' +| | `-DeclaratorList Declarators +| | 
`-SimpleDeclarator ListElement +| | `-'&' +| `-')' CloseParen `-';' )txt"})); } @@ -3397,21 +3430,23 @@ `-SimpleDeclaration |-'friend' |-'X' - |-SimpleDeclarator Declarator - | |-'operator' - | |-'+' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | |-SimpleDeclaration ListElement - | | | `-'X' - | | |-',' ListDelimiter - | | `-SimpleDeclaration ListElement - | | |-'const' - | | |-'X' - | | `-SimpleDeclarator Declarator - | | `-'&' - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'operator' + | |-'+' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | `-'X' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'const' + | | |-'X' + | | `-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | `-'&' + | `-')' CloseParen `-';' )txt"})); } @@ -3463,11 +3498,12 @@ |-'>' `-SimpleDeclaration |-'T' - |-SimpleDeclarator Declarator - | |-'f' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'f' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-';' )txt")); } @@ -3491,11 +3527,12 @@ |-'>' `-SimpleDeclaration |-'T' - |-SimpleDeclarator Declarator - | |-'var' - | |-'=' - | `-IntegerLiteralExpression - | `-'10' LiteralToken + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'var' + | |-'=' + | `-IntegerLiteralExpression + | `-'10' LiteralToken `-';' )txt")); } @@ -3522,11 +3559,12 @@ `-SimpleDeclaration |-'static' |-'U' - |-SimpleDeclarator Declarator - | |-'f' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'f' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-';' )txt"})); } @@ -3565,11 +3603,12 @@ | |-'>' | 
`-SimpleDeclaration | |-'U' - | |-SimpleDeclarator Declarator - | | |-'foo' - | | `-ParametersAndQualifiers - | | |-'(' OpenParen - | | `-')' CloseParen + | |-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | |-'foo' + | | `-ParametersAndQualifiers + | | |-'(' OpenParen + | | `-')' CloseParen | `-';' |-'}' `-';' @@ -3617,11 +3656,12 @@ | | `-SimpleDeclaration | | |-'static' | | |-'U' - | | |-SimpleDeclarator Declarator - | | | |-'f' - | | | `-ParametersAndQualifiers - | | | |-'(' OpenParen - | | | `-')' CloseParen + | | |-DeclaratorList Declarators + | | | `-SimpleDeclarator ListElement + | | | |-'f' + | | | `-ParametersAndQualifiers + | | | |-'(' OpenParen + | | | `-')' CloseParen | | `-';' | |-'}' | `-';' @@ -3834,8 +3874,9 @@ | |-'"C"' | `-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | `-'a' +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | `-'a' | `-';' `-LinkageSpecificationDeclaration |-'extern' @@ -3843,13 +3884,15 @@ |-'{' |-SimpleDeclaration | |-'int' - | |-SimpleDeclarator Declarator - | | `-'b' + | |-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | `-'b' | `-';' |-SimpleDeclaration | |-'int' - | |-SimpleDeclarator Declarator - | | `-'c' + | |-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | `-'c' | `-';' `-'}' )txt")); @@ -3876,11 +3919,12 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'test' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'test' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement |-'{' OpenParen |-CompoundStatement Statement @@ -3913,11 +3957,12 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'test' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | 
`-SimpleDeclarator ListElement + | |-'test' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement |-'{' OpenParen unmodifiable `-'}' CloseParen unmodifiable @@ -3936,11 +3981,12 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'test' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'test' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement |-'{' OpenParen |-IfStatement Statement @@ -3980,11 +4026,12 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'test' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'test' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement |-'{' OpenParen |-ExpressionStatement Statement @@ -4018,11 +4065,12 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'test' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'test' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-CompoundStatement |-'{' OpenParen |-IfStatement Statement @@ -4104,11 +4152,12 @@ {R"txt( SimpleDeclaration |-'S' -`-SimpleDeclarator Declarator - |-'s' - |-'=' - `-IntegerLiteralExpression - `-'1' LiteralToken +`-DeclaratorList Declarators + `-SimpleDeclarator ListElement + |-'s' + |-'=' + `-IntegerLiteralExpression + `-'1' LiteralToken )txt"})); } @@ -4133,36 +4182,39 @@ {R"txt( SimpleDeclaration |-'S' -`-SimpleDeclarator Declarator - `-UnknownExpression - |-'s0' - |-'{' - `-'}' +`-DeclaratorList Declarators + `-SimpleDeclarator ListElement + `-UnknownExpression + |-'s0' + |-'{' + `-'}' )txt", R"txt( SimpleDeclaration |-'S' 
-`-SimpleDeclarator Declarator - `-UnknownExpression - |-'s1' - |-'{' - |-IntegerLiteralExpression - | `-'1' LiteralToken - `-'}' +`-DeclaratorList Declarators + `-SimpleDeclarator ListElement + `-UnknownExpression + |-'s1' + |-'{' + |-IntegerLiteralExpression + | `-'1' LiteralToken + `-'}' )txt", R"txt( SimpleDeclaration |-'S' -`-SimpleDeclarator Declarator - `-UnknownExpression - |-'s2' - |-'{' - |-IntegerLiteralExpression - | `-'1' LiteralToken - |-',' - |-FloatingLiteralExpression - | `-'2.' LiteralToken - `-'}' +`-DeclaratorList Declarators + `-SimpleDeclarator ListElement + `-UnknownExpression + |-'s2' + |-'{' + |-IntegerLiteralExpression + | `-'1' LiteralToken + |-',' + |-FloatingLiteralExpression + | `-'2.' LiteralToken + `-'}' )txt"})); } @@ -4187,39 +4239,42 @@ {R"txt( SimpleDeclaration |-'S' -`-SimpleDeclarator Declarator - |-'s0' - |-'=' - `-UnknownExpression - |-'{' - `-'}' +`-DeclaratorList Declarators + `-SimpleDeclarator ListElement + |-'s0' + |-'=' + `-UnknownExpression + |-'{' + `-'}' )txt", R"txt( SimpleDeclaration |-'S' -`-SimpleDeclarator Declarator - |-'s1' - |-'=' - `-UnknownExpression - |-'{' - |-IntegerLiteralExpression - | `-'1' LiteralToken - `-'}' +`-DeclaratorList Declarators + `-SimpleDeclarator ListElement + |-'s1' + |-'=' + `-UnknownExpression + |-'{' + |-IntegerLiteralExpression + | `-'1' LiteralToken + `-'}' )txt", R"txt( SimpleDeclaration |-'S' -`-SimpleDeclarator Declarator - |-'s2' - |-'=' - `-UnknownExpression - |-'{' - |-IntegerLiteralExpression - | `-'1' LiteralToken - |-',' - |-FloatingLiteralExpression - | `-'2.' LiteralToken - `-'}' +`-DeclaratorList Declarators + `-SimpleDeclarator ListElement + |-'s2' + |-'=' + `-UnknownExpression + |-'{' + |-IntegerLiteralExpression + | `-'1' LiteralToken + |-',' + |-FloatingLiteralExpression + | `-'2.' 
LiteralToken + `-'}' )txt"})); } @@ -4240,28 +4295,30 @@ {R"txt( SimpleDeclaration |-'S' -|-SimpleDeclarator Declarator -| `-UnknownExpression -| |-'s1' -| |-'(' -| |-IntegerLiteralExpression -| | `-'1' LiteralToken -| `-')' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| `-UnknownExpression +| |-'s1' +| |-'(' +| |-IntegerLiteralExpression +| | `-'1' LiteralToken +| `-')' `-';' )txt", R"txt( SimpleDeclaration |-'S' -|-SimpleDeclarator Declarator -| `-UnknownExpression -| |-'s2' -| |-'(' -| |-IntegerLiteralExpression -| | `-'1' LiteralToken -| |-',' -| |-FloatingLiteralExpression -| | `-'2.' LiteralToken -| `-')' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| `-UnknownExpression +| |-'s2' +| |-'(' +| |-IntegerLiteralExpression +| | `-'1' LiteralToken +| |-',' +| |-FloatingLiteralExpression +| | `-'2.' LiteralToken +| `-')' `-';' )txt"})); } @@ -4283,35 +4340,38 @@ {R"txt( SimpleDeclaration |-'S' -|-SimpleDeclarator Declarator -| `-'s0' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| `-'s0' `-';' )txt", R"txt( SimpleDeclaration |-'S' -|-SimpleDeclarator Declarator -| `-UnknownExpression -| |-'s1' -| |-'(' -| |-IntegerLiteralExpression -| | `-'1' LiteralToken -| `-')' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| `-UnknownExpression +| |-'s1' +| |-'(' +| |-IntegerLiteralExpression +| | `-'1' LiteralToken +| `-')' `-';' )txt", R"txt( SimpleDeclaration |-'S' -|-SimpleDeclarator Declarator -| `-UnknownExpression -| |-'s2' -| |-'(' -| |-IntegerLiteralExpression -| | `-'1' LiteralToken -| |-',' -| |-FloatingLiteralExpression -| | `-'2.' LiteralToken -| `-')' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| `-UnknownExpression +| |-'s2' +| |-'(' +| |-IntegerLiteralExpression +| | `-'1' LiteralToken +| |-',' +| |-FloatingLiteralExpression +| | `-'2.' 
LiteralToken +| `-')' `-';' )txt"})); } @@ -4518,13 +4578,14 @@ TranslationUnit Detached `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'a' - | `-ArraySubscript - | |-'[' OpenParen - | |-IntegerLiteralExpression Size - | | `-'10' LiteralToken - | `-']' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'a' + | `-ArraySubscript + | |-'[' OpenParen + | |-IntegerLiteralExpression Size + | | `-'10' LiteralToken + | `-']' CloseParen `-';' )txt")); } @@ -4538,23 +4599,24 @@ TranslationUnit Detached `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'b' - | |-ArraySubscript - | | |-'[' OpenParen - | | |-IntegerLiteralExpression Size - | | | `-'1' LiteralToken - | | `-']' CloseParen - | |-ArraySubscript - | | |-'[' OpenParen - | | |-IntegerLiteralExpression Size - | | | `-'2' LiteralToken - | | `-']' CloseParen - | `-ArraySubscript - | |-'[' OpenParen - | |-IntegerLiteralExpression Size - | | `-'3' LiteralToken - | `-']' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'b' + | |-ArraySubscript + | | |-'[' OpenParen + | | |-IntegerLiteralExpression Size + | | | `-'1' LiteralToken + | | `-']' CloseParen + | |-ArraySubscript + | | |-'[' OpenParen + | | |-IntegerLiteralExpression Size + | | | `-'2' LiteralToken + | | `-']' CloseParen + | `-ArraySubscript + | |-'[' OpenParen + | |-IntegerLiteralExpression Size + | | `-'3' LiteralToken + | `-']' CloseParen `-';' )txt")); } @@ -4568,24 +4630,25 @@ TranslationUnit Detached `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'c' - | |-ArraySubscript - | | |-'[' OpenParen - | | `-']' CloseParen - | |-'=' - | `-UnknownExpression + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'c' + | |-ArraySubscript + | | |-'[' OpenParen + | | `-']' CloseParen + | |-'=' | `-UnknownExpression - | |-'{' - | |-IntegerLiteralExpression - | | `-'1' LiteralToken - | |-',' - | |-IntegerLiteralExpression - | | `-'2' 
LiteralToken - | |-',' - | |-IntegerLiteralExpression - | | `-'3' LiteralToken - | `-'}' + | `-UnknownExpression + | |-'{' + | |-IntegerLiteralExpression + | | `-'1' LiteralToken + | |-',' + | |-IntegerLiteralExpression + | | `-'2' LiteralToken + | |-',' + | |-IntegerLiteralExpression + | | `-'3' LiteralToken + | `-'}' `-';' )txt")); } @@ -4602,22 +4665,24 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'f' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | `-SimpleDeclaration ListElement - | | |-'int' - | | `-SimpleDeclarator Declarator - | | |-'xs' - | | `-ArraySubscript - | | |-'[' OpenParen - | | |-'static' - | | |-IntegerLiteralExpression Size - | | | `-'10' LiteralToken - | | `-']' CloseParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'f' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | `-SimpleDeclaration ListElement + | | |-'int' + | | `-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | |-'xs' + | | `-ArraySubscript + | | |-'[' OpenParen + | | |-'static' + | | |-IntegerLiteralExpression Size + | | | `-'10' LiteralToken + | | `-']' CloseParen + | `-')' CloseParen `-';' )txt")); } @@ -4631,11 +4696,12 @@ TranslationUnit Detached `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'func' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'func' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | `-')' CloseParen `-';' )txt")); } @@ -4651,48 +4717,55 @@ TranslationUnit Detached |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | |-'func1' -| | `-ParametersAndQualifiers -| | |-'(' OpenParen -| | |-ParameterDeclarationList Parameters -| | | `-SimpleDeclaration ListElement -| | | |-'int' -| | | `-SimpleDeclarator Declarator -| | | 
`-'a' -| | `-')' CloseParen +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | |-'func1' +| | `-ParametersAndQualifiers +| | |-'(' OpenParen +| | |-ParameterDeclarationList Parameters +| | | `-SimpleDeclaration ListElement +| | | |-'int' +| | | `-DeclaratorList Declarators +| | | `-SimpleDeclarator ListElement +| | | `-'a' +| | `-')' CloseParen | `-';' |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | |-'func2' -| | `-ParametersAndQualifiers -| | |-'(' OpenParen -| | |-ParameterDeclarationList Parameters -| | | `-SimpleDeclaration ListElement -| | | |-'int' -| | | `-SimpleDeclarator Declarator -| | | |-'*' -| | | `-'ap' -| | `-')' CloseParen +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | |-'func2' +| | `-ParametersAndQualifiers +| | |-'(' OpenParen +| | |-ParameterDeclarationList Parameters +| | | `-SimpleDeclaration ListElement +| | | |-'int' +| | | `-DeclaratorList Declarators +| | | `-SimpleDeclarator ListElement +| | | |-'*' +| | | `-'ap' +| | `-')' CloseParen | `-';' `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'func3' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | |-SimpleDeclaration ListElement - | | | |-'int' - | | | `-SimpleDeclarator Declarator - | | | `-'a' - | | |-',' ListDelimiter - | | `-SimpleDeclaration ListElement - | | |-'float' - | | `-SimpleDeclarator Declarator - | | `-'b' - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'func3' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | |-'int' + | | | `-DeclaratorList Declarators + | | | `-SimpleDeclarator ListElement + | | | `-'a' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'float' + | | `-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | `-'b' + | `-')' CloseParen `-';' )txt")); } @@ 
-4708,41 +4781,45 @@ TranslationUnit Detached |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | |-'func1' -| | `-ParametersAndQualifiers -| | |-'(' OpenParen -| | |-ParameterDeclarationList Parameters -| | | `-SimpleDeclaration ListElement -| | | `-'int' -| | `-')' CloseParen +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | |-'func1' +| | `-ParametersAndQualifiers +| | |-'(' OpenParen +| | |-ParameterDeclarationList Parameters +| | | `-SimpleDeclaration ListElement +| | | `-'int' +| | `-')' CloseParen | `-';' |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | |-'func2' -| | `-ParametersAndQualifiers -| | |-'(' OpenParen -| | |-ParameterDeclarationList Parameters -| | | `-SimpleDeclaration ListElement -| | | |-'int' -| | | `-SimpleDeclarator Declarator -| | | `-'*' -| | `-')' CloseParen +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | |-'func2' +| | `-ParametersAndQualifiers +| | |-'(' OpenParen +| | |-ParameterDeclarationList Parameters +| | | `-SimpleDeclaration ListElement +| | | |-'int' +| | | `-DeclaratorList Declarators +| | | `-SimpleDeclarator ListElement +| | | `-'*' +| | `-')' CloseParen | `-';' `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'func3' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | |-SimpleDeclaration ListElement - | | | `-'int' - | | |-',' ListDelimiter - | | `-SimpleDeclaration ListElement - | | `-'float' - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'func3' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | `-'int' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | `-'float' + | `-')' CloseParen `-';' )txt")); } @@ -4760,11 +4837,12 @@ ParameterDeclarationList Parameters `-SimpleDeclaration ListElement |-'int' - `-SimpleDeclarator 
Declarator - |-'a' - |-'=' - `-IntegerLiteralExpression - `-'1' LiteralToken + `-DeclaratorList Declarators + `-SimpleDeclarator ListElement + |-'a' + |-'=' + `-IntegerLiteralExpression + `-'1' LiteralToken )txt"})); } @@ -4781,25 +4859,28 @@ ParameterDeclarationList Parameters |-SimpleDeclaration ListElement | |-'int' -| `-SimpleDeclarator Declarator -| |-'*' -| `-'ap' +| `-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'*' +| `-'ap' |-',' ListDelimiter |-SimpleDeclaration ListElement | |-'int' -| `-SimpleDeclarator Declarator -| |-'a' -| |-'=' -| `-IntegerLiteralExpression -| `-'1' LiteralToken +| `-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'a' +| |-'=' +| `-IntegerLiteralExpression +| `-'1' LiteralToken |-',' ListDelimiter `-SimpleDeclaration ListElement |-'char' - `-SimpleDeclarator Declarator - |-'c' - |-'=' - `-CharacterLiteralExpression - `-''2'' LiteralToken + `-DeclaratorList Declarators + `-SimpleDeclarator ListElement + |-'c' + |-'=' + `-CharacterLiteralExpression + `-''2'' LiteralToken )txt"})); } @@ -4816,18 +4897,19 @@ {R"txt( SimpleDeclaration |-'void' -|-SimpleDeclarator Declarator -| |-'test' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-ParameterDeclarationList Parameters -| | |-SimpleDeclaration ListElement -| | | `-'T' -| | |-',' ListDelimiter -| | `-SimpleDeclaration ListElement -| | |-'Args' -| | `-'...' -| `-')' CloseParen +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'test' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-ParameterDeclarationList Parameters +| | |-SimpleDeclaration ListElement +| | | `-'T' +| | |-',' ListDelimiter +| | `-SimpleDeclaration ListElement +| | |-'Args' +| | `-'...' 
+| `-')' CloseParen `-';' )txt"})); } @@ -4845,22 +4927,25 @@ {R"txt( SimpleDeclaration |-'void' -|-SimpleDeclarator Declarator -| |-'test' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-ParameterDeclarationList Parameters -| | |-SimpleDeclaration ListElement -| | | |-'T' -| | | `-SimpleDeclarator Declarator -| | | `-'t' -| | |-',' ListDelimiter -| | `-SimpleDeclaration ListElement -| | |-'Args' -| | |-'...' -| | `-SimpleDeclarator Declarator -| | `-'args' -| `-')' CloseParen +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'test' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-ParameterDeclarationList Parameters +| | |-SimpleDeclaration ListElement +| | | |-'T' +| | | `-DeclaratorList Declarators +| | | `-SimpleDeclarator ListElement +| | | `-'t' +| | |-',' ListDelimiter +| | `-SimpleDeclaration ListElement +| | |-'Args' +| | |-'...' +| | `-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | `-'args' +| `-')' CloseParen `-';' )txt"})); } @@ -4878,18 +4963,19 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'test' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | |-SimpleDeclaration ListElement - | | | `-'int' - | | |-',' ListDelimiter - | | `-SimpleDeclaration ListElement - | | `-'char' - | |-'...' - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'test' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | `-'int' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | `-'char' + | |-'...' 
+ | `-')' CloseParen `-';' )txt")); } @@ -4907,30 +4993,34 @@ TranslationUnit Detached `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'func' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | |-SimpleDeclaration ListElement - | | | |-'const' - | | | |-'int' - | | | `-SimpleDeclarator Declarator - | | | `-'a' - | | |-',' ListDelimiter - | | |-SimpleDeclaration ListElement - | | | |-'volatile' - | | | |-'int' - | | | `-SimpleDeclarator Declarator - | | | `-'b' - | | |-',' ListDelimiter - | | `-SimpleDeclaration ListElement - | | |-'const' - | | |-'volatile' - | | |-'int' - | | `-SimpleDeclarator Declarator - | | `-'c' - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'func' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | |-'const' + | | | |-'int' + | | | `-DeclaratorList Declarators + | | | `-SimpleDeclarator ListElement + | | | `-'a' + | | |-',' ListDelimiter + | | |-SimpleDeclaration ListElement + | | | |-'volatile' + | | | |-'int' + | | | `-DeclaratorList Declarators + | | | `-SimpleDeclarator ListElement + | | | `-'b' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'const' + | | |-'volatile' + | | |-'int' + | | `-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | `-'c' + | `-')' CloseParen `-';' )txt")); } @@ -4947,17 +5037,19 @@ TranslationUnit Detached `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'func' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | `-SimpleDeclaration ListElement - | | |-'int' - | | `-SimpleDeclarator Declarator - | | |-'&' - | | `-'a' - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'func' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | 
`-SimpleDeclaration ListElement + | | |-'int' + | | `-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | |-'&' + | | `-'a' + | `-')' CloseParen `-';' )txt")); } @@ -4975,17 +5067,19 @@ TranslationUnit Detached `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'func' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | `-SimpleDeclaration ListElement - | | |-'int' - | | `-SimpleDeclarator Declarator - | | |-'&&' - | | `-'a' - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'func' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | `-SimpleDeclaration ListElement + | | |-'int' + | | `-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | |-'&&' + | | `-'a' + | `-')' CloseParen `-';' )txt")); } @@ -5008,11 +5102,12 @@ |-'{' |-SimpleDeclaration | |-'int' - | |-SimpleDeclarator Declarator - | | |-'a' - | | `-ParametersAndQualifiers - | | |-'(' OpenParen - | | `-')' CloseParen + | |-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | |-'a' + | | `-ParametersAndQualifiers + | | |-'(' OpenParen + | | `-')' CloseParen | `-';' |-'}' `-';' @@ -5035,35 +5130,38 @@ {R"txt( SimpleDeclaration |-'int' -|-SimpleDeclarator Declarator -| |-'b' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-')' CloseParen -| `-'const' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'b' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| `-'const' `-';' )txt", R"txt( SimpleDeclaration |-'int' -|-SimpleDeclarator Declarator -| |-'c' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-')' CloseParen -| `-'volatile' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'c' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| `-'volatile' `-';' )txt", R"txt( SimpleDeclaration |-'int' -|-SimpleDeclarator Declarator -| |-'d' -| 
`-ParametersAndQualifiers -| |-'(' OpenParen -| |-')' CloseParen -| |-'const' -| `-'volatile' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'d' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| |-'const' +| `-'volatile' `-';' )txt"})); } @@ -5081,12 +5179,13 @@ {R"txt( SimpleDeclaration |-'int' -|-SimpleDeclarator Declarator -| |-'e' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-')' CloseParen -| `-'&' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'e' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| `-'&' `-';' )txt"})); } @@ -5104,12 +5203,13 @@ {R"txt( SimpleDeclaration |-'int' -|-SimpleDeclarator Declarator -| |-'f' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-')' CloseParen -| `-'&&' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'f' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| `-'&&' `-';' )txt"})); } @@ -5126,14 +5226,15 @@ TranslationUnit Detached `-SimpleDeclaration |-'auto' - |-SimpleDeclarator Declarator - | |-'foo' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-')' CloseParen - | `-TrailingReturnType TrailingReturn - | |-'->' ArrowToken - | `-'int' + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'foo' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-')' CloseParen + | `-TrailingReturnType TrailingReturn + | |-'->' ArrowToken + | `-'int' `-';' )txt")); } @@ -5154,58 +5255,62 @@ {R"txt( SimpleDeclaration |-'int' -|-SimpleDeclarator Declarator -| |-'a' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-')' CloseParen -| |-'throw' -| |-'(' -| `-')' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'a' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| |-'throw' +| |-'(' +| `-')' `-';' )txt", R"txt( SimpleDeclaration |-'int' -|-SimpleDeclarator Declarator -| |-'b' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-')' CloseParen 
-| |-'throw' -| |-'(' -| |-'...' -| `-')' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'b' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| |-'throw' +| |-'(' +| |-'...' +| `-')' `-';' )txt", R"txt( SimpleDeclaration |-'int' -|-SimpleDeclarator Declarator -| |-'c' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-')' CloseParen -| |-'throw' -| |-'(' -| |-'MyException1' -| `-')' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'c' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| |-'throw' +| |-'(' +| |-'MyException1' +| `-')' `-';' )txt", R"txt( SimpleDeclaration |-'int' -|-SimpleDeclarator Declarator -| |-'d' -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-')' CloseParen -| |-'throw' -| |-'(' -| |-'MyException1' -| |-',' -| |-'MyException2' -| `-')' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-'d' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-')' CloseParen +| |-'throw' +| |-'(' +| |-'MyException1' +| |-',' +| |-'MyException2' +| `-')' `-';' )txt"})); } @@ -5223,25 +5328,27 @@ TranslationUnit Detached |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | |-'a' -| | `-ParametersAndQualifiers -| | |-'(' OpenParen -| | |-')' CloseParen -| | `-'noexcept' +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | |-'a' +| | `-ParametersAndQualifiers +| | |-'(' OpenParen +| | |-')' CloseParen +| | `-'noexcept' | `-';' `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'b' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-')' CloseParen - | |-'noexcept' - | |-'(' - | |-BoolLiteralExpression - | | `-'true' LiteralToken - | `-')' + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'b' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-')' CloseParen + | |-'noexcept' + | |-'(' + | |-BoolLiteralExpression + | | `-'true' LiteralToken + | `-')' `-';' )txt")); } @@ -5258,50 
+5365,54 @@ TranslationUnit Detached |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | `-ParenDeclarator -| | |-'(' OpenParen -| | |-'a' -| | `-')' CloseParen +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | `-ParenDeclarator +| | |-'(' OpenParen +| | |-'a' +| | `-')' CloseParen | `-';' |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | |-'*' -| | `-ParenDeclarator -| | |-'(' OpenParen -| | |-'b' -| | `-')' CloseParen +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | |-'*' +| | `-ParenDeclarator +| | |-'(' OpenParen +| | |-'b' +| | `-')' CloseParen | `-';' |-SimpleDeclaration | |-'int' -| |-SimpleDeclarator Declarator -| | |-ParenDeclarator -| | | |-'(' OpenParen -| | | |-'*' -| | | |-'c' -| | | `-')' CloseParen -| | `-ParametersAndQualifiers -| | |-'(' OpenParen -| | |-ParameterDeclarationList Parameters -| | | `-SimpleDeclaration ListElement -| | | `-'int' -| | `-')' CloseParen +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | |-ParenDeclarator +| | | |-'(' OpenParen +| | | |-'*' +| | | |-'c' +| | | `-')' CloseParen +| | `-ParametersAndQualifiers +| | |-'(' OpenParen +| | |-ParameterDeclarationList Parameters +| | | `-SimpleDeclaration ListElement +| | | `-'int' +| | `-')' CloseParen | `-';' `-SimpleDeclaration |-'int' - |-SimpleDeclarator Declarator - | |-'*' - | |-ParenDeclarator - | | |-'(' OpenParen - | | |-'d' - | | `-')' CloseParen - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | `-SimpleDeclaration ListElement - | | `-'int' - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'*' + | |-ParenDeclarator + | | |-'(' OpenParen + | | |-'d' + | | `-')' CloseParen + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | `-SimpleDeclaration ListElement + | | `-'int' + | `-')' CloseParen `-';' )txt")); } @@ -5317,22 +5428,24 @@ 
|-SimpleDeclaration | |-'const' | |-'int' -| |-SimpleDeclarator Declarator -| | |-'west' -| | |-'=' -| | `-PrefixUnaryOperatorExpression -| | |-'-' OperatorToken -| | `-IntegerLiteralExpression Operand -| | `-'1' LiteralToken +| |-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | |-'west' +| | |-'=' +| | `-PrefixUnaryOperatorExpression +| | |-'-' OperatorToken +| | `-IntegerLiteralExpression Operand +| | `-'1' LiteralToken | `-';' `-SimpleDeclaration |-'int' |-'const' - |-SimpleDeclarator Declarator - | |-'east' - | |-'=' - | `-IntegerLiteralExpression - | `-'1' LiteralToken + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'east' + | |-'=' + | `-IntegerLiteralExpression + | `-'1' LiteralToken `-';' )txt")); } @@ -5348,11 +5461,12 @@ |-'const' |-'int' |-'const' - |-SimpleDeclarator Declarator - | |-'universal' - | |-'=' - | `-IntegerLiteralExpression - | `-'0' LiteralToken + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'universal' + | |-'=' + | `-IntegerLiteralExpression + | `-'0' LiteralToken `-';' )txt")); } @@ -5369,12 +5483,13 @@ |-'const' |-'int' |-'const' - |-SimpleDeclarator Declarator - | |-'*' - | |-'const' - | |-'*' - | |-'volatile' - | `-'b' + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'*' + | |-'const' + | |-'*' + | |-'volatile' + | `-'b' `-';' )txt")); } @@ -5391,30 +5506,31 @@ TranslationUnit Detached `-SimpleDeclaration |-'auto' - |-SimpleDeclarator Declarator - | |-'foo' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-')' CloseParen - | `-TrailingReturnType TrailingReturn - | |-'->' ArrowToken - | |-'auto' - | `-SimpleDeclarator Declarator - | |-ParenDeclarator - | | |-'(' OpenParen - | | |-'*' - | | `-')' CloseParen - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | `-SimpleDeclaration ListElement - | | `-'int' - | |-')' CloseParen - | `-TrailingReturnType TrailingReturn - | |-'->' ArrowToken - | 
|-'double' - | `-SimpleDeclarator Declarator - | `-'*' + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'foo' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-')' CloseParen + | `-TrailingReturnType TrailingReturn + | |-'->' ArrowToken + | |-'auto' + | `-SimpleDeclarator Declarator + | |-ParenDeclarator + | | |-'(' OpenParen + | | |-'*' + | | `-')' CloseParen + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | `-SimpleDeclaration ListElement + | | `-'int' + | |-')' CloseParen + | `-TrailingReturnType TrailingReturn + | |-'->' ArrowToken + | |-'double' + | `-SimpleDeclarator Declarator + | `-'*' `-';' )txt")); } @@ -5432,24 +5548,26 @@ {R"txt( SimpleDeclaration |-'int' -|-SimpleDeclarator Declarator -| |-MemberPointer -| | |-'X' -| | |-'::' -| | `-'*' -| `-'a' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-MemberPointer +| | |-'X' +| | |-'::' +| | `-'*' +| `-'a' `-';' )txt", R"txt( SimpleDeclaration |-'const' |-'int' -|-SimpleDeclarator Declarator -| |-MemberPointer -| | |-'X' -| | |-'::' -| | `-'*' -| `-'b' +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-MemberPointer +| | |-'X' +| | |-'::' +| | `-'*' +| `-'b' `-';' )txt"})); } @@ -5472,70 +5590,75 @@ {R"txt( SimpleDeclaration |-'void' -|-SimpleDeclarator Declarator -| |-ParenDeclarator -| | |-'(' OpenParen -| | |-MemberPointer -| | | |-'X' -| | | |-'::' -| | | `-'*' -| | |-'xp' -| | `-')' CloseParen -| `-ParametersAndQualifiers -| |-'(' OpenParen -| `-')' CloseParen +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-ParenDeclarator +| | |-'(' OpenParen +| | |-MemberPointer +| | | |-'X' +| | | |-'::' +| | | `-'*' +| | |-'xp' +| | `-')' CloseParen +| `-ParametersAndQualifiers +| |-'(' OpenParen +| `-')' CloseParen `-';' )txt", R"txt( SimpleDeclaration |-'void' -|-SimpleDeclarator Declarator -| |-ParenDeclarator -| | |-'(' OpenParen -| | |-MemberPointer -| | | |-'X' -| | | |-'::' -| 
| | `-'*' -| | |-'*' -| | |-'xpp' -| | `-')' CloseParen -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-ParameterDeclarationList Parameters -| | `-SimpleDeclaration ListElement -| | |-'const' -| | |-'int' -| | `-SimpleDeclarator Declarator -| | `-'*' -| `-')' CloseParen +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-ParenDeclarator +| | |-'(' OpenParen +| | |-MemberPointer +| | | |-'X' +| | | |-'::' +| | | `-'*' +| | |-'*' +| | |-'xpp' +| | `-')' CloseParen +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-ParameterDeclarationList Parameters +| | `-SimpleDeclaration ListElement +| | |-'const' +| | |-'int' +| | `-DeclaratorList Declarators +| | `-SimpleDeclarator ListElement +| | `-'*' +| `-')' CloseParen `-';' )txt", R"txt( SimpleDeclaration |-'void' -|-SimpleDeclarator Declarator -| |-ParenDeclarator -| | |-'(' OpenParen -| | |-'X' -| | |-'::' -| | |-MemberPointer -| | | |-'Y' -| | | |-'::' -| | | `-'*' -| | |-'xyp' -| | `-')' CloseParen -| `-ParametersAndQualifiers -| |-'(' OpenParen -| |-ParameterDeclarationList Parameters -| | |-SimpleDeclaration ListElement -| | | |-'const' -| | | |-'int' -| | | `-SimpleDeclarator Declarator -| | | `-'*' -| | |-',' ListDelimiter -| | `-SimpleDeclaration ListElement -| | `-'char' -| `-')' CloseParen +|-DeclaratorList Declarators +| `-SimpleDeclarator ListElement +| |-ParenDeclarator +| | |-'(' OpenParen +| | |-'X' +| | |-'::' +| | |-MemberPointer +| | | |-'Y' +| | | |-'::' +| | | `-'*' +| | |-'xyp' +| | `-')' CloseParen +| `-ParametersAndQualifiers +| |-'(' OpenParen +| |-ParameterDeclarationList Parameters +| | |-SimpleDeclaration ListElement +| | | |-'const' +| | | |-'int' +| | | `-DeclaratorList Declarators +| | | `-SimpleDeclarator ListElement +| | | `-'*' +| | |-',' ListDelimiter +| | `-SimpleDeclaration ListElement +| | `-'char' +| `-')' CloseParen `-';' )txt"})); } @@ -5549,31 +5672,34 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'x' - | 
`-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | |-SimpleDeclaration ListElement - | | | |-'char' - | | | `-SimpleDeclarator Declarator - | | | `-'a' - | | |-',' ListDelimiter - | | `-SimpleDeclaration ListElement - | | |-'short' - | | `-SimpleDeclarator Declarator - | | |-ParenDeclarator - | | | |-'(' OpenParen - | | | |-'*' - | | | |-'b' - | | | `-')' CloseParen - | | `-ParametersAndQualifiers - | | |-'(' OpenParen - | | |-ParameterDeclarationList Parameters - | | | `-SimpleDeclaration ListElement - | | | `-'int' - | | `-')' CloseParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'x' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | |-'char' + | | | `-DeclaratorList Declarators + | | | `-SimpleDeclarator ListElement + | | | `-'a' + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'short' + | | `-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | |-ParenDeclarator + | | | |-'(' OpenParen + | | | |-'*' + | | | |-'b' + | | | `-')' CloseParen + | | `-ParametersAndQualifiers + | | |-'(' OpenParen + | | |-ParameterDeclarationList Parameters + | | | `-SimpleDeclaration ListElement + | | | `-'int' + | | `-')' CloseParen + | `-')' CloseParen `-';' )txt")); } @@ -5587,48 +5713,52 @@ TranslationUnit Detached `-SimpleDeclaration |-'void' - |-SimpleDeclarator Declarator - | |-'x' - | `-ParametersAndQualifiers - | |-'(' OpenParen - | |-ParameterDeclarationList Parameters - | | |-SimpleDeclaration ListElement - | | | |-'char' - | | | `-SimpleDeclarator Declarator - | | | `-'a' - | | |-',' ListDelimiter - | | |-SimpleDeclaration ListElement - | | | |-'short' - | | | `-SimpleDeclarator Declarator - | | | |-ParenDeclarator - | | | | |-'(' OpenParen - | | | | |-'*' - | | | | |-'b' - | | | | `-')' CloseParen - | | | `-ParametersAndQualifiers - | | | |-'(' OpenParen 
- | | | |-ParameterDeclarationList Parameters - | | | | `-SimpleDeclaration ListElement - | | | | `-'int' - | | | `-')' CloseParen - | | |-',' ListDelimiter - | | `-SimpleDeclaration ListElement - | | |-'long' - | | `-SimpleDeclarator Declarator - | | |-ParenDeclarator - | | | |-'(' OpenParen - | | | |-'*' - | | | |-'*' - | | | |-'c' - | | | `-')' CloseParen - | | `-ParametersAndQualifiers - | | |-'(' OpenParen - | | |-ParameterDeclarationList Parameters - | | | `-SimpleDeclaration ListElement - | | | |-'long' - | | | `-'long' - | | `-')' CloseParen - | `-')' CloseParen + |-DeclaratorList Declarators + | `-SimpleDeclarator ListElement + | |-'x' + | `-ParametersAndQualifiers + | |-'(' OpenParen + | |-ParameterDeclarationList Parameters + | | |-SimpleDeclaration ListElement + | | | |-'char' + | | | `-DeclaratorList Declarators + | | | `-SimpleDeclarator ListElement + | | | `-'a' + | | |-',' ListDelimiter + | | |-SimpleDeclaration ListElement + | | | |-'short' + | | | `-DeclaratorList Declarators + | | | `-SimpleDeclarator ListElement + | | | |-ParenDeclarator + | | | | |-'(' OpenParen + | | | | |-'*' + | | | | |-'b' + | | | | `-')' CloseParen + | | | `-ParametersAndQualifiers + | | | |-'(' OpenParen + | | | |-ParameterDeclarationList Parameters + | | | | `-SimpleDeclaration ListElement + | | | | `-'int' + | | | `-')' CloseParen + | | |-',' ListDelimiter + | | `-SimpleDeclaration ListElement + | | |-'long' + | | `-DeclaratorList Declarators + | | `-SimpleDeclarator ListElement + | | |-ParenDeclarator + | | | |-'(' OpenParen + | | | |-'*' + | | | |-'*' + | | | |-'c' + | | | `-')' CloseParen + | | `-ParametersAndQualifiers + | | |-'(' OpenParen + | | |-ParameterDeclarationList Parameters + | | | `-SimpleDeclaration ListElement + | | | |-'long' + | | | `-'long' + | | `-')' CloseParen + | `-')' CloseParen `-';' )txt")); } diff --git a/clang/unittests/Tooling/Syntax/SynthesisTest.cpp b/clang/unittests/Tooling/Syntax/SynthesisTest.cpp --- 
a/clang/unittests/Tooling/Syntax/SynthesisTest.cpp +++ b/clang/unittests/Tooling/Syntax/SynthesisTest.cpp @@ -188,8 +188,9 @@ TranslationUnit Detached synthesized `-SimpleDeclaration synthesized |-'int' synthesized - |-SimpleDeclarator Declarator synthesized - | `-'a' synthesized + |-DeclaratorList Declarators synthesized + | `-SimpleDeclarator ListElement synthesized + | `-'a' synthesized `-';' synthesized )txt")); } @@ -201,8 +202,9 @@ EXPECT_TRUE(treeDumpEqual(Copy, R"txt( SimpleDeclaration Detached synthesized |-'int' synthesized -|-SimpleDeclarator Declarator synthesized -| `-'a' synthesized +|-DeclaratorList Declarators synthesized +| `-SimpleDeclarator ListElement synthesized +| `-'a' synthesized `-';' synthesized )txt")); } @@ -225,11 +227,12 @@ TranslationUnit Detached synthesized `-SimpleDeclaration synthesized |-'void' synthesized - |-SimpleDeclarator Declarator synthesized - | |-'test' synthesized - | `-ParametersAndQualifiers synthesized - | |-'(' OpenParen synthesized - | `-')' CloseParen synthesized + |-DeclaratorList Declarators synthesized + | `-SimpleDeclarator ListElement synthesized + | |-'test' synthesized + | `-ParametersAndQualifiers synthesized + | |-'(' OpenParen synthesized + | `-')' CloseParen synthesized `-CompoundStatement synthesized |-'{' OpenParen synthesized |-IfStatement Statement synthesized diff --git a/compiler-rt/cmake/base-config-ix.cmake b/compiler-rt/cmake/base-config-ix.cmake --- a/compiler-rt/cmake/base-config-ix.cmake +++ b/compiler-rt/cmake/base-config-ix.cmake @@ -5,7 +5,6 @@ include(CheckIncludeFile) include(CheckCXXSourceCompiles) -include(TestBigEndian) check_include_file(unwind.h HAVE_UNWIND_H) @@ -188,22 +187,13 @@ test_target_arch(x86_64 "" "") endif() endif() + elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "powerpc64le") + test_target_arch(powerpc64le "" "-m64") elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "powerpc") - # Strip out -nodefaultlibs when calling TEST_BIG_ENDIAN. 
Configuration - # will fail with this option when building with a sanitizer. - cmake_push_check_state() - string(REPLACE "-nodefaultlibs" "" CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") - TEST_BIG_ENDIAN(HOST_IS_BIG_ENDIAN) - cmake_pop_check_state() - - if(HOST_IS_BIG_ENDIAN) - if(CMAKE_SYSTEM_NAME MATCHES "AIX") - test_target_arch(powerpc "" "-m32") - endif() - test_target_arch(powerpc64 "" "-m64") - else() - test_target_arch(powerpc64le "" "-m64") + if(CMAKE_SYSTEM_NAME MATCHES "AIX") + test_target_arch(powerpc "" "-m32") endif() + test_target_arch(powerpc64 "" "-m64") elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "s390x") test_target_arch(s390x "" "") elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "sparc") diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -4867,6 +4867,34 @@ #define INIT_TMPNAM_R #endif +#if SANITIZER_INTERCEPT_PTSNAME +INTERCEPTOR(char *, ptsname, int fd) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, ptsname, fd); + char *res = REAL(ptsname)(fd); + if (res != nullptr) + COMMON_INTERCEPTOR_INITIALIZE_RANGE(res, REAL(strlen)(res) + 1); + return res; +} +#define INIT_PTSNAME COMMON_INTERCEPT_FUNCTION(ptsname); +#else +#define INIT_PTSNAME +#endif + +#if SANITIZER_INTERCEPT_PTSNAME_R +INTERCEPTOR(int, ptsname_r, int fd, char *name, SIZE_T namesize) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, ptsname_r, fd, name, namesize); + int res = REAL(ptsname_r)(fd, name, namesize); + if (res == 0) + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, name, REAL(strlen)(name) + 1); + return res; +} +#define INIT_PTSNAME_R COMMON_INTERCEPT_FUNCTION(ptsname_r); +#else +#define INIT_PTSNAME_R +#endif + #if SANITIZER_INTERCEPT_TTYNAME INTERCEPTOR(char *, ttyname, int fd) { void *ctx; @@ -10166,6 +10194,8 @@ 
INIT_PTHREAD_BARRIERATTR_GETPSHARED; INIT_TMPNAM; INIT_TMPNAM_R; + INIT_PTSNAME; + INIT_PTSNAME_R; INIT_TTYNAME; INIT_TTYNAME_R; INIT_TEMPNAM; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -384,6 +384,8 @@ #define SANITIZER_INTERCEPT_THR_EXIT SI_FREEBSD #define SANITIZER_INTERCEPT_TMPNAM SI_POSIX #define SANITIZER_INTERCEPT_TMPNAM_R SI_LINUX_NOT_ANDROID || SI_SOLARIS +#define SANITIZER_INTERCEPT_PTSNAME SI_LINUX +#define SANITIZER_INTERCEPT_PTSNAME_R SI_LINUX #define SANITIZER_INTERCEPT_TTYNAME SI_POSIX #define SANITIZER_INTERCEPT_TTYNAME_R SI_POSIX #define SANITIZER_INTERCEPT_TEMPNAM SI_POSIX diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h --- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h @@ -85,6 +85,14 @@ return pc - 4; #elif defined(__sparc__) || defined(__mips__) return pc - 8; +#elif SANITIZER_RISCV64 + // RV-64 has variable instruction length...
+ // C extension gives us 2-byte instructions + // RV-64 has 4-byte instructions + // + RISCV architecture allows instructions up to 8 bytes + // It seems difficult to figure out the exact instruction length - + // pc - 2 seems like a safe option for the purposes of stack tracing + return pc - 2; #else return pc - 1; #endif diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp --- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp @@ -21,6 +21,28 @@ return pc + 8; #elif defined(__powerpc__) || defined(__arm__) || defined(__aarch64__) return pc + 4; +#elif SANITIZER_RISCV64 + // Current check order is 4 -> 2 -> 6 -> 8 + u8 InsnByte = *(u8 *)(pc); + if (((InsnByte & 0x3) == 0x3) && ((InsnByte & 0x1c) != 0x1c)) { + // xxxxxxxxxxxbbb11 | 32 bit | bbb != 111 + return pc + 4; + } + if ((InsnByte & 0x3) != 0x3) { + // xxxxxxxxxxxxxxaa | 16 bit | aa != 11 + return pc + 2; + } + // RISC-V encoding allows instructions to be up to 8 bytes long + if ((InsnByte & 0x3f) == 0x1f) { + // xxxxxxxxxx011111 | 48 bit | + return pc + 6; + } + if ((InsnByte & 0x7f) == 0x3f) { + // xxxxxxxxx0111111 | 64 bit | + return pc + 8; + } + // bail-out if could not figure out the instruction size + return 0; #else return pc + 1; #endif diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cpp --- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cpp @@ -134,6 +134,7 @@ } CHECK(out_buf <= out_end); *out_buf = 0; + frame->ClearAll(); } SANITIZER_INTERFACE_ATTRIBUTE diff --git a/compiler-rt/test/asan/TestCases/Posix/no_asan_gen_globals.c b/compiler-rt/test/asan/TestCases/Posix/no_asan_gen_globals.c --- a/compiler-rt/test/asan/TestCases/Posix/no_asan_gen_globals.c +++
b/compiler-rt/test/asan/TestCases/Posix/no_asan_gen_globals.c @@ -1,5 +1,7 @@ // FIXME: https://code.google.com/p/address-sanitizer/issues/detail?id=316 // XFAIL: android +// Bug 47607 +// XFAIL: solaris // Make sure ___asan_gen_* strings do not end up in the symbol table. // RUN: %clang_asan %s -o %t.exe diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/ptsname.c b/compiler-rt/test/sanitizer_common/TestCases/Linux/ptsname.c new file mode 100644 --- /dev/null +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/ptsname.c @@ -0,0 +1,27 @@ +// RUN: %clang %s -o %t && %run %t + +#define _GNU_SOURCE +#define _XOPEN_SOURCE 600 + +#include +#include +#include +#include +#include + +int main() { + int pt = posix_openpt(O_NOCTTY); + if (pt == -1) + return 0; + char *s = ptsname(pt); + assert(s); + assert(strstr(s, "/dev")); + + char buff[1000] = {}; + int r = ptsname_r(pt, buff, sizeof(buff)); + assert(!r); + assert(strstr(buff, "/dev")); + + close(pt); + return 0; +} diff --git a/debuginfo-tests/CMakeLists.txt b/debuginfo-tests/CMakeLists.txt --- a/debuginfo-tests/CMakeLists.txt +++ b/debuginfo-tests/CMakeLists.txt @@ -22,6 +22,16 @@ not ) +if ("mlir" IN_LIST LLVM_ENABLE_PROJECTS) + add_llvm_executable(check-gdb-mlir-support + llvm-prettyprinters/gdb/mlir-support.cpp + ) + target_include_directories(check-gdb-mlir-support PRIVATE ${LLVM_EXTERNAL_MLIR_SOURCE_DIR}/include) + target_link_libraries(check-gdb-mlir-support PRIVATE MLIRIR) + list(APPEND DEBUGINFO_TEST_DEPS check-gdb-mlir-support) + set(MLIR_SOURCE_DIR ${LLVM_EXTERNAL_MLIR_SOURCE_DIR}) +endif() + if("compiler-rt" IN_LIST LLVM_ENABLE_PROJECTS) # llgdb-tests/asan.c and other asan* files. 
if(TARGET asan) diff --git a/debuginfo-tests/lit.cfg.py b/debuginfo-tests/lit.cfg.py --- a/debuginfo-tests/lit.cfg.py +++ b/debuginfo-tests/lit.cfg.py @@ -157,6 +157,6 @@ if apple_lldb_vers < 1000: config.available_features.add('apple-lldb-pre-1000') -llvm_config.feature_config([('--build-mode', { - 'Debug|RelWithDebInfo': 'debug-info' -})]) +llvm_config.feature_config( + [('--build-mode', {'Debug|RelWithDebInfo': 'debug-info'})] +) diff --git a/debuginfo-tests/lit.site.cfg.py.in b/debuginfo-tests/lit.site.cfg.py.in --- a/debuginfo-tests/lit.site.cfg.py.in +++ b/debuginfo-tests/lit.site.cfg.py.in @@ -20,6 +20,8 @@ config.host_arch = "@HOST_ARCH@" config.is_msvc = lit.util.pythonize_bool("@MSVC@") +config.mlir_src_root = "@MLIR_SOURCE_DIR@" + config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" config.python3_executable = "@Python3_EXECUTABLE@" diff --git a/debuginfo-tests/llvm-prettyprinters/gdb/lit.local.cfg b/debuginfo-tests/llvm-prettyprinters/gdb/lit.local.cfg --- a/debuginfo-tests/llvm-prettyprinters/gdb/lit.local.cfg +++ b/debuginfo-tests/llvm-prettyprinters/gdb/lit.local.cfg @@ -4,6 +4,10 @@ if 'native' not in config.available_features or lit.util.which('gdb') is None: config.unsupported = True +if config.mlir_src_root: + config.substitutions.append(("%mlir_src_root", config.mlir_src_root)) + config.available_features.add('mlir') + config.suffixes = ['.gdb'] diff --git a/debuginfo-tests/llvm-prettyprinters/gdb/llvm-support.cpp b/debuginfo-tests/llvm-prettyprinters/gdb/llvm-support.cpp --- a/debuginfo-tests/llvm-prettyprinters/gdb/llvm-support.cpp +++ b/debuginfo-tests/llvm-prettyprinters/gdb/llvm-support.cpp @@ -53,8 +53,13 @@ return Result; }(); -// Check expected instances to avoid compile errors. -auto CheckExpectedValue = static_cast(ExpectedValue); -auto CheckExpectedError = static_cast(ExpectedError); - -int main() { return 0; } +int main() { + // Reference symbols that might otherwise be stripped. 
+ ArrayRef[0]; + MutableArrayRef[0]; + !ExpectedValue; + !ExpectedError; + *OptionalValue; + *OptionalNone; + return 0; +} diff --git a/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.cpp b/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.cpp new file mode 100644 --- /dev/null +++ b/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.cpp @@ -0,0 +1,41 @@ +#include "mlir/IR/Identifier.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/StandardTypes.h" + +mlir::MLIRContext Context; + +auto Identifier = mlir::Identifier::get("foo", &Context); +mlir::OperationName OperationName("FooOp", &Context); +mlir::Value Value({reinterpret_cast(0x8), + mlir::Value::Kind::TrailingOpResult}); + +mlir::Type Type(nullptr); +mlir::Type IndexType = mlir::IndexType::get(&Context); +mlir::Type IntegerType = + mlir::IntegerType::get(3, mlir::IntegerType::Unsigned, &Context); +mlir::Type FloatType = mlir::Float32Type::get(&Context); +mlir::Type MemRefType = mlir::MemRefType::get({4, 5}, FloatType); +mlir::Type UnrankedMemRefType = mlir::UnrankedMemRefType::get(IntegerType, 6); +mlir::Type VectorType = mlir::VectorType::get({1, 2}, FloatType); +mlir::Type TupleType = + mlir::TupleType::get(mlir::TypeRange({IndexType, FloatType}), &Context); + +auto UnknownLoc = mlir::UnknownLoc::get(&Context); +auto FileLineColLoc = mlir::FileLineColLoc::get("file", 7, 8, &Context); +auto OpaqueLoc = mlir::OpaqueLoc::get(9, &Context); +auto NameLoc = mlir::NameLoc::get(Identifier, &Context); +auto CallSiteLoc = mlir::CallSiteLoc::get(FileLineColLoc, OpaqueLoc); +auto FusedLoc = mlir::FusedLoc::get({FileLineColLoc, NameLoc}, &Context); + +mlir::Attribute UnitAttr = mlir::UnitAttr::get(&Context); +mlir::Attribute FloatAttr = mlir::FloatAttr::get(FloatType, 1.0); +mlir::Attribute IntegerAttr = mlir::IntegerAttr::get(IntegerType, 10); +mlir::Attribute TypeAttr = mlir::TypeAttr::get(IndexType); +mlir::Attribute ArrayAttr = 
mlir::ArrayAttr::get({UnitAttr}, &Context); +mlir::Attribute StringAttr = mlir::StringAttr::get("foo", &Context); +mlir::Attribute ElementsAttr = mlir::DenseElementsAttr::get( + VectorType.cast(), llvm::ArrayRef{2.0f, 3.0f}); + +int main() { return 0; } diff --git a/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.gdb b/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.gdb new file mode 100644 --- /dev/null +++ b/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.gdb @@ -0,0 +1,112 @@ +# RUN: gdb -q -batch -n -iex 'source %mlir_src_root/utils/gdb-scripts/prettyprinters.py' -iex 'source %llvm_src_root/utils/gdb-scripts/prettyprinters.py' -x %s %llvm_tools_dir/check-gdb-mlir-support | FileCheck %s +# REQUIRES: debug-info +# REQUIRES: mlir + +break main +run + +# CHECK: "foo" +p Identifier + +# CHECK: "FooOp" +p OperationName + +# CHECK: 0x8 +# CHECK: TrailingOpResult +p Value + +# CHECK: impl = 0x0 +p Type + +# CHECK: cast +p IndexType + +# CHECK: cast +# CHECK: width = 3 +# CHECK: Unsigned +p IntegerType + +# CHECK: cast +p FloatType + +# CHECK: cast +# CHECK: shapeSize = 2 +# CHECK: shapeElements[0] = 4 +# CHECK: shapeElements[1] = 5 +p MemRefType + +# CHECK: cast +# CHECK: memorySpace = 6 +p UnrankedMemRefType + +# CHECK: cast +# CHECK: shapeSize = 2 +# CHECK: shapeElements[0] = 1 +# CHECK: shapeElements[1] = 2 +p VectorType + +# CHECK: cast +# CHECK: numElements = 2 +# CHECK: elements[0] +# CHECK: mlir::IndexType +# CHECK: elements[1] +# CHECK: mlir::Float32Type +p TupleType + +# CHECK: cast +p UnknownLoc + +# CHECK: cast +# CHECK: filename = "file" +# CHECK: line = 7 +# CHECK: column = 8 +p FileLineColLoc + +# CHECK: cast +# CHECK: underlyingLocation = 9 +p OpaqueLoc + +# CHECK: cast +# CHECK: name = "foo" +# CHECK: mlir::UnknownLoc +p NameLoc + +# CHECK: cast +# CHECK: callee +# CHECK: mlir::FileLineColLoc +# CHECK: caller +# CHECK: mlir::OpaqueLoc +p CallSiteLoc + +# CHECK: cast +# CHECK: numLocs = 2 +# CHECK: locs[0] +# CHECK: mlir::FileLineColLoc +# 
CHECK: locs[1] +# CHECK: mlir::NameLoc +p FusedLoc + +# CHECK: cast +p UnitAttr + +# CHECK: cast +p FloatAttr + +# CHECK: cast +p IntegerAttr + +# CHECK: cast +# CHECK: mlir::IndexType +p TypeAttr + +# CHECK: cast +# CHECK: llvm::ArrayRef of length 1 +# CHECK: mlir::UnitAttr +p ArrayAttr + +# CHECK: cast +# CHECK: value = "foo" +p StringAttr + +# CHECK: cast +p ElementsAttr diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h --- a/flang/include/flang/Evaluate/characteristics.h +++ b/flang/include/flang/Evaluate/characteristics.h @@ -45,7 +45,7 @@ using common::CopyableIndirection; -// Are these procedures distinguishable for a generic name? +// Are these procedures distinguishable for a generic name or FINAL? bool Distinguishable(const Procedure &, const Procedure &); // Are these procedures distinguishable for a generic operator or assignment? bool DistinguishableOpOrAssign(const Procedure &, const Procedure &); diff --git a/flang/include/flang/Evaluate/type.h b/flang/include/flang/Evaluate/type.h --- a/flang/include/flang/Evaluate/type.h +++ b/flang/include/flang/Evaluate/type.h @@ -166,11 +166,9 @@ bool HasDeferredTypeParameter() const; // 7.3.2.3 & 15.5.2.4 type compatibility. - // x.IsTypeCompatibleWith(y) is true if "x => y" or passing actual y to + // x.IsTkCompatibleWith(y) is true if "x => y" or passing actual y to // dummy argument x would be valid. Be advised, this is not a reflexive - // relation. - bool IsTypeCompatibleWith(const DynamicType &) const; - // Type compatible and kind type parameters match + // relation. Kind type parameters must match. 
bool IsTkCompatibleWith(const DynamicType &) const; // Result will be missing when a symbol is absent or diff --git a/flang/include/flang/Lower/PFTBuilder.h b/flang/include/flang/Lower/PFTBuilder.h --- a/flang/include/flang/Lower/PFTBuilder.h +++ b/flang/include/flang/Lower/PFTBuilder.h @@ -55,8 +55,9 @@ using Ref = common::Reference>; ReferenceVariantBase() = delete; - template - ReferenceVariantBase(B &b) : u{Ref{b}} {} + ReferenceVariantBase(std::variant...> b) : u(b) {} + template + ReferenceVariantBase(Ref b) : u(b) {} template constexpr BaseType &get() const { diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h --- a/flang/include/flang/Semantics/symbol.h +++ b/flang/include/flang/Semantics/symbol.h @@ -248,6 +248,8 @@ const std::list ¶mNames() const { return paramNames_; } const SymbolVector ¶mDecls() const { return paramDecls_; } bool sequence() const { return sequence_; } + std::map &finals() { return finals_; } + const std::map &finals() const { return finals_; } bool isForwardReferenced() const { return isForwardReferenced_; } void add_paramName(const SourceName &name) { paramNames_.push_back(name); } void add_paramDecl(const Symbol &symbol) { paramDecls_.push_back(symbol); } @@ -279,6 +281,7 @@ // These are the names of the derived type's components in component // order. A parent component, if any, appears first in this list. 
std::list componentNames_; + std::map finals_; // FINAL :: subr bool sequence_{false}; bool isForwardReferenced_{false}; friend llvm::raw_ostream &operator<<( @@ -322,8 +325,6 @@ std::size_t alignment_{0}; // required alignment in bytes }; -class FinalProcDetails {}; // TODO - class MiscDetails { public: ENUM_CLASS(Kind, None, ConstructName, ScopeName, PassName, ComplexPartRe, @@ -471,7 +472,7 @@ ObjectEntityDetails, ProcEntityDetails, AssocEntityDetails, DerivedTypeDetails, UseDetails, UseErrorDetails, HostAssocDetails, GenericDetails, ProcBindingDetails, NamelistDetails, CommonBlockDetails, - FinalProcDetails, TypeParamDetails, MiscDetails>; + TypeParamDetails, MiscDetails>; llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Details &); std::string DetailsToString(const Details &); diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -162,6 +162,7 @@ } bool IsAssumedLengthCharacter(const Symbol &); bool IsExternal(const Symbol &); +bool IsModuleProcedure(const Symbol &); // Is the symbol modifiable in this scope std::optional WhyNotModifiable( const Symbol &, const Scope &); @@ -283,6 +284,20 @@ return value && *value == 0; } +// 15.2.2 +enum class ProcedureDefinitionClass { + None, + Intrinsic, + External, + Internal, + Module, + Dummy, + Pointer, + StatementFunction +}; + +ProcedureDefinitionClass ClassifyProcedure(const Symbol &); + // Derived type component iterator that provides a C++ LegacyForwardIterator // iterator over the Ordered, Direct, Ultimate or Potential components of a // DerivedTypeSpec. 
These iterators can be used with STL algorithms diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp --- a/flang/lib/Evaluate/characteristics.cpp +++ b/flang/lib/Evaluate/characteristics.cpp @@ -130,7 +130,7 @@ const TypeAndShape &that, const char *thisIs, const char *thatIs, bool isElemental) const { const auto &len{that.LEN()}; - if (!type_.IsTypeCompatibleWith(that.type_)) { + if (!type_.IsTkCompatibleWith(that.type_)) { messages.Say( "%1$s type '%2$s' is not compatible with %3$s type '%4$s'"_err_en_US, thatIs, that.type_.AsFortran(len ? len->AsFortran() : ""), thisIs, diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h --- a/flang/lib/Evaluate/fold-implementation.h +++ b/flang/lib/Evaluate/fold-implementation.h @@ -1155,8 +1155,11 @@ return *array; } return std::visit( - [&](auto &kindExpr) -> Expr { + [&context, &convert](auto &kindExpr) -> Expr { using Operand = ResultType; + // This variable is a workaround for msvc which emits an error when + // using the FROMCAT template parameter below. + TypeCategory constexpr FromCat{FROMCAT}; char buffer[64]; if (auto value{GetScalarConstantValue(kindExpr)}) { if constexpr (TO::category == TypeCategory::Integer) { @@ -1213,7 +1216,7 @@ return Expr{value->IsTrue()}; } } else if constexpr (std::is_same_v && - FROMCAT != TypeCategory::Character) { + FromCat != TypeCategory::Character) { return std::move(kindExpr); // remove needless conversion } return Expr{std::move(convert)}; diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -965,7 +965,6 @@ [](const GenericDetails &) { return true; }, [](const ProcBindingDetails &) { return true; }, [](const UseDetails &x) { return IsProcedure(x.symbol()); }, - // TODO: FinalProcDetails? 
[](const auto &) { return false; }, }, symbol.details()); diff --git a/flang/lib/Evaluate/type.cpp b/flang/lib/Evaluate/type.cpp --- a/flang/lib/Evaluate/type.cpp +++ b/flang/lib/Evaluate/type.cpp @@ -218,19 +218,6 @@ } } -static const semantics::Symbol *FindComponent( - const semantics::DerivedTypeSpec &derived, parser::CharBlock name) { - if (const auto *scope{derived.scope()}) { - auto iter{scope->find(name)}; - if (iter != scope->end()) { - return &*iter->second; - } else if (const auto *parent{GetParentTypeSpec(derived)}) { - return FindComponent(*parent, name); - } - } - return nullptr; -} - // Compares two derived type representations to see whether they both // represent the "same type" in the sense of section 7.5.2.4. using SetOfDerivedTypePairs = @@ -294,24 +281,9 @@ if (x.attrs().test(semantics::Attr::PRIVATE)) { return false; } -#if 0 // TODO - if (const auto *xObject{x.detailsIf()}) { - if (const auto *yObject{y.detailsIf()}) { -#else - if (x.has()) { - if (y.has()) { -#endif - // TODO: compare types, type parameters, bounds, &c. - return true; -} -else { - return false; -} -} // namespace Fortran::evaluate -else { - // TODO: non-object components - return true; -} + // TODO: compare types, parameters, bounds, &c. + return x.has() == + y.has(); } static bool AreCompatibleDerivedTypes(const semantics::DerivedTypeSpec *x, @@ -334,45 +306,9 @@ return param && param->attr() == common::TypeParamAttr::Kind; } -static bool IsKindTypeParameter( - const semantics::DerivedTypeSpec &derived, parser::CharBlock name) { - const semantics::Symbol *symbol{FindComponent(derived, name)}; - return symbol && IsKindTypeParameter(*symbol); -} - -bool DynamicType::IsTypeCompatibleWith(const DynamicType &that) const { - if (derived_) { - if (!AreCompatibleDerivedTypes(derived_, that.derived_, IsPolymorphic())) { - return false; - } - // The values of derived type KIND parameters must match. 
- for (const auto &[name, param] : derived_->parameters()) { - if (IsKindTypeParameter(*derived_, name)) { - bool ok{false}; - if (auto myValue{ToInt64(param.GetExplicit())}) { - if (const auto *thatParam{that.derived_->FindParameter(name)}) { - if (auto thatValue{ToInt64(thatParam->GetExplicit())}) { - ok = *myValue == *thatValue; - } - } - } - if (!ok) { - return false; - } - } - } - return true; - } else if (category_ == that.category_ && kind_ == that.kind_) { - // CHARACTER length is not checked here - return true; - } else { - return IsUnlimitedPolymorphic(); - } -} - // Do the kind type parameters of type1 have the same values as the -// corresponding kind type parameters of the type2? -static bool IsKindCompatible(const semantics::DerivedTypeSpec &type1, +// corresponding kind type parameters of type2? +static bool AreKindCompatible(const semantics::DerivedTypeSpec &type1, const semantics::DerivedTypeSpec &type2) { for (const auto &[name, param1] : type1.parameters()) { if (param1.isKind()) { @@ -385,18 +321,20 @@ return true; } +// See 7.3.2.3 (5) & 15.5.2.4 bool DynamicType::IsTkCompatibleWith(const DynamicType &that) const { - if (category_ != TypeCategory::Derived) { - return category_ == that.category_ && kind_ == that.kind_; - } else if (IsUnlimitedPolymorphic()) { + if (IsUnlimitedPolymorphic()) { return true; } else if (that.IsUnlimitedPolymorphic()) { return false; - } else if (!derived_ || !that.derived_ || - !IsKindCompatible(*derived_, *that.derived_)) { - return false; // kind params don't match + } else if (category_ != that.category_) { + return false; + } else if (derived_) { + return that.derived_ && + AreCompatibleDerivedTypes(derived_, that.derived_, IsPolymorphic()) && + AreKindCompatible(*derived_, *that.derived_); } else { - return AreCompatibleDerivedTypes(derived_, that.derived_, IsPolymorphic()); + return kind_ == that.kind_; } } diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt --- 
a/flang/lib/Lower/CMakeLists.txt +++ b/flang/lib/Lower/CMakeLists.txt @@ -30,7 +30,7 @@ MLIRAffineToStandard MLIRLLVMIR MLIRSCFToStandard - MLIRStandardOps + MLIRStandard LINK_COMPONENTS Support diff --git a/flang/lib/Lower/PFTBuilder.cpp b/flang/lib/Lower/PFTBuilder.cpp --- a/flang/lib/Lower/PFTBuilder.cpp +++ b/flang/lib/Lower/PFTBuilder.cpp @@ -64,8 +64,11 @@ class PFTBuilder { public: PFTBuilder(const semantics::SemanticsContext &semanticsContext) - : pgm{std::make_unique()}, - parentVariantStack{*pgm.get()}, semanticsContext{semanticsContext} {} + : pgm{std::make_unique()}, semanticsContext{ + semanticsContext} { + lower::pft::ParentVariant parent{*pgm.get()}; + parentVariantStack.push_back(parent); + } /// Get the result std::unique_ptr result() { return std::move(pgm); } @@ -905,11 +908,15 @@ template static lower::pft::FunctionLikeUnit::FunctionStatement getFunctionStmt(const T &func) { - return std::get>(func.t); + lower::pft::FunctionLikeUnit::FunctionStatement result{ + std::get>(func.t)}; + return result; } template static lower::pft::ModuleLikeUnit::ModuleStatement getModuleStmt(const T &mod) { - return std::get>(mod.t); + lower::pft::ModuleLikeUnit::ModuleStatement result{ + std::get>(mod.t)}; + return result; } static const semantics::Symbol *getSymbol( @@ -1078,7 +1085,8 @@ const auto &ps{ std::get>>(func.t)}; if (ps.has_value()) { - beginStmt = ps.value(); + FunctionStatement begin{ps.value()}; + beginStmt = begin; symbol = getSymbol(beginStmt); processSymbolTable(*symbol->scope()); } else { diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -1903,8 +1903,8 @@ EndOpenACC(); Walk(std::get>(x.t)); BeginOpenACC(); - Walk("!$ACC END ", std::get>(x.t)); - Put("\n"); + Walk("!$ACC END ", std::get>(x.t), + "\n"); EndOpenACC(); } void Unparse(const OpenACCRoutineConstruct &x) { diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp 
--- a/flang/lib/Semantics/check-call.cpp +++ b/flang/lib/Semantics/check-call.cpp @@ -144,8 +144,7 @@ parser::ContextualMessages &messages{context.messages()}; PadShortCharacterActual(actual, dummy.type, actualType, messages); ConvertIntegerActual(actual, dummy.type, actualType, messages); - bool typesCompatible{ - dummy.type.type().IsTypeCompatibleWith(actualType.type())}; + bool typesCompatible{dummy.type.type().IsTkCompatibleWith(actualType.type())}; if (typesCompatible) { if (isElemental) { } else if (dummy.type.attrs().test( @@ -215,13 +214,17 @@ "Actual argument associated with TYPE(*) %s may not have type-bound procedure '%s'"_err_en_US, dummyName, tbp->name()); } - if (const Symbol * - finalizer{FindImmediateComponent(*derived, [](const Symbol &symbol) { - return symbol.has(); - })}) { // 15.5.2.4(2) - evaluate::SayWithDeclaration(messages, *finalizer, - "Actual argument associated with TYPE(*) %s may not have FINAL subroutine '%s'"_err_en_US, - dummyName, finalizer->name()); + const auto &finals{ + derived->typeSymbol().get().finals()}; + if (!finals.empty()) { // 15.5.2.4(2) + if (auto *msg{messages.Say( + "Actual argument associated with TYPE(*) %s may not have derived type '%s' with FINAL subroutine '%s'"_err_en_US, + dummyName, derived->typeSymbol().name(), + finals.begin()->first)}) { + msg->Attach(finals.begin()->first, + "FINAL subroutine '%s' in derived type '%s'"_en_US, + finals.begin()->first, derived->typeSymbol().name()); + } } } if (actualIsCoindexed) { @@ -431,14 +434,14 @@ "If a POINTER or ALLOCATABLE dummy or actual argument is polymorphic, both must be so"_err_en_US); } } else if (!actualIsUnlimited && typesCompatible) { - if (!actualType.type().IsTypeCompatibleWith(dummy.type.type())) { + if (!actualType.type().IsTkCompatibleWith(dummy.type.type())) { if (dummy.intent == common::Intent::In) { // extension: allow with warning, rule is only relevant for definables messages.Say( - "POINTER or ALLOCATABLE dummy and actual arguments should 
have the same declared type"_en_US); + "POINTER or ALLOCATABLE dummy and actual arguments should have the same declared type and kind"_en_US); } else { messages.Say( - "POINTER or ALLOCATABLE dummy and actual arguments must have the same declared type"_err_en_US); + "POINTER or ALLOCATABLE dummy and actual arguments must have the same declared type and kind"_err_en_US); } } if (const auto *derived{ diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -66,6 +66,10 @@ void CheckSubprogram(const Symbol &, const SubprogramDetails &); void CheckAssumedTypeEntity(const Symbol &, const ObjectEntityDetails &); void CheckDerivedType(const Symbol &, const DerivedTypeDetails &); + bool CheckFinal( + const Symbol &subroutine, SourceName, const Symbol &derivedType); + bool CheckDistinguishableFinals(const Symbol &f1, SourceName f1name, + const Symbol &f2, SourceName f2name, const Symbol &derivedType); void CheckGeneric(const Symbol &, const GenericDetails &); void CheckHostAssoc(const Symbol &, const HostAssocDetails &); bool CheckDefinedOperator( @@ -781,24 +785,24 @@ } void CheckHelper::CheckDerivedType( - const Symbol &symbol, const DerivedTypeDetails &details) { - const Scope *scope{symbol.scope()}; + const Symbol &derivedType, const DerivedTypeDetails &details) { + const Scope *scope{derivedType.scope()}; if (!scope) { CHECK(details.isForwardReferenced()); return; } - CHECK(scope->symbol() == &symbol); + CHECK(scope->symbol() == &derivedType); CHECK(scope->IsDerivedType()); - if (symbol.attrs().test(Attr::ABSTRACT) && // C734 - (symbol.attrs().test(Attr::BIND_C) || details.sequence())) { + if (derivedType.attrs().test(Attr::ABSTRACT) && // C734 + (derivedType.attrs().test(Attr::BIND_C) || details.sequence())) { messages_.Say("An ABSTRACT derived type must be extensible"_err_en_US); } - if (const DeclTypeSpec * 
parent{FindParentTypeSpec(symbol)}) { + if (const DeclTypeSpec * parent{FindParentTypeSpec(derivedType)}) { const DerivedTypeSpec *parentDerived{parent->AsDerived()}; if (!IsExtensibleType(parentDerived)) { // C705 messages_.Say("The parent type is not extensible"_err_en_US); } - if (!symbol.attrs().test(Attr::ABSTRACT) && parentDerived && + if (!derivedType.attrs().test(Attr::ABSTRACT) && parentDerived && parentDerived->typeSymbol().attrs().test(Attr::ABSTRACT)) { ScopeComponentIterator components{*parentDerived}; for (const Symbol &component : components) { @@ -811,7 +815,7 @@ } } } - DerivedTypeSpec derived{symbol.name(), symbol}; + DerivedTypeSpec derived{derivedType.name(), derivedType}; derived.set_scope(*scope); if (FindCoarrayUltimateComponent(derived) && // C736 !(parentDerived && FindCoarrayUltimateComponent(*parentDerived))) { @@ -819,7 +823,7 @@ "Type '%s' has a coarray ultimate component so the type at the base " "of its type extension chain ('%s') must be a type that has a " "coarray ultimate component"_err_en_US, - symbol.name(), scope->GetDerivedTypeBase().GetSymbol()->name()); + derivedType.name(), scope->GetDerivedTypeBase().GetSymbol()->name()); } if (FindEventOrLockPotentialComponent(derived) && // C737 !(FindEventOrLockPotentialComponent(*parentDerived) || @@ -829,13 +833,154 @@ "at the base of its type extension chain ('%s') must either have an " "EVENT_TYPE or LOCK_TYPE component, or be EVENT_TYPE or " "LOCK_TYPE"_err_en_US, - symbol.name(), scope->GetDerivedTypeBase().GetSymbol()->name()); + derivedType.name(), scope->GetDerivedTypeBase().GetSymbol()->name()); } } - if (HasIntrinsicTypeName(symbol)) { // C729 + if (HasIntrinsicTypeName(derivedType)) { // C729 messages_.Say("A derived type name cannot be the name of an intrinsic" " type"_err_en_US); } + std::map previous; + for (const auto &pair : details.finals()) { + SourceName source{pair.first}; + const Symbol &ref{*pair.second}; + if (CheckFinal(ref, source, derivedType) && + 
std::all_of(previous.begin(), previous.end(), + [&](std::pair prev) { + return CheckDistinguishableFinals( + ref, source, *prev.second, prev.first, derivedType); + })) { + previous.emplace(source, ref); + } + } +} + +// C786 +bool CheckHelper::CheckFinal( + const Symbol &subroutine, SourceName finalName, const Symbol &derivedType) { + if (!IsModuleProcedure(subroutine)) { + SayWithDeclaration(subroutine, finalName, + "FINAL subroutine '%s' of derived type '%s' must be a module procedure"_err_en_US, + subroutine.name(), derivedType.name()); + return false; + } + const Procedure *proc{Characterize(subroutine)}; + if (!proc) { + return false; // error recovery + } + if (!proc->IsSubroutine()) { + SayWithDeclaration(subroutine, finalName, + "FINAL subroutine '%s' of derived type '%s' must be a subroutine"_err_en_US, + subroutine.name(), derivedType.name()); + return false; + } + if (proc->dummyArguments.size() != 1) { + SayWithDeclaration(subroutine, finalName, + "FINAL subroutine '%s' of derived type '%s' must have a single dummy argument"_err_en_US, + subroutine.name(), derivedType.name()); + return false; + } + const auto &arg{proc->dummyArguments[0]}; + const Symbol *errSym{&subroutine}; + if (const auto *details{subroutine.detailsIf()}) { + if (!details->dummyArgs().empty()) { + if (const Symbol * argSym{details->dummyArgs()[0]}) { + errSym = argSym; + } + } + } + const auto *ddo{std::get_if(&arg.u)}; + if (!ddo) { + SayWithDeclaration(subroutine, finalName, + "FINAL subroutine '%s' of derived type '%s' must have a single dummy argument that is a data object"_err_en_US, + subroutine.name(), derivedType.name()); + return false; + } + bool ok{true}; + if (arg.IsOptional()) { + SayWithDeclaration(*errSym, finalName, + "FINAL subroutine '%s' of derived type '%s' must not have an OPTIONAL dummy argument"_err_en_US, + subroutine.name(), derivedType.name()); + ok = false; + } + if (ddo->attrs.test(DummyDataObject::Attr::Allocatable)) { + SayWithDeclaration(*errSym, 
finalName, + "FINAL subroutine '%s' of derived type '%s' must not have an ALLOCATABLE dummy argument"_err_en_US, + subroutine.name(), derivedType.name()); + ok = false; + } + if (ddo->attrs.test(DummyDataObject::Attr::Pointer)) { + SayWithDeclaration(*errSym, finalName, + "FINAL subroutine '%s' of derived type '%s' must not have a POINTER dummy argument"_err_en_US, + subroutine.name(), derivedType.name()); + ok = false; + } + if (ddo->intent == common::Intent::Out) { + SayWithDeclaration(*errSym, finalName, + "FINAL subroutine '%s' of derived type '%s' must not have a dummy argument with INTENT(OUT)"_err_en_US, + subroutine.name(), derivedType.name()); + ok = false; + } + if (ddo->attrs.test(DummyDataObject::Attr::Value)) { + SayWithDeclaration(*errSym, finalName, + "FINAL subroutine '%s' of derived type '%s' must not have a dummy argument with the VALUE attribute"_err_en_US, + subroutine.name(), derivedType.name()); + ok = false; + } + if (ddo->type.corank() > 0) { + SayWithDeclaration(*errSym, finalName, + "FINAL subroutine '%s' of derived type '%s' must not have a coarray dummy argument"_err_en_US, + subroutine.name(), derivedType.name()); + ok = false; + } + if (ddo->type.type().IsPolymorphic()) { + SayWithDeclaration(*errSym, finalName, + "FINAL subroutine '%s' of derived type '%s' must not have a polymorphic dummy argument"_err_en_US, + subroutine.name(), derivedType.name()); + ok = false; + } else if (ddo->type.type().category() != TypeCategory::Derived || + &ddo->type.type().GetDerivedTypeSpec().typeSymbol() != &derivedType) { + SayWithDeclaration(*errSym, finalName, + "FINAL subroutine '%s' of derived type '%s' must have a TYPE(%s) dummy argument"_err_en_US, + subroutine.name(), derivedType.name(), derivedType.name()); + ok = false; + } else { // check that all LEN type parameters are assumed + for (auto ref : OrderParameterDeclarations(derivedType)) { + if (const auto *paramDetails{ref->detailsIf()}) { + if (paramDetails->attr() == 
common::TypeParamAttr::Len) { + const auto *value{ + ddo->type.type().GetDerivedTypeSpec().FindParameter(ref->name())}; + if (!value || !value->isAssumed()) { + SayWithDeclaration(*errSym, finalName, + "FINAL subroutine '%s' of derived type '%s' must have a dummy argument with an assumed LEN type parameter '%s=*'"_err_en_US, + subroutine.name(), derivedType.name(), ref->name()); + ok = false; + } + } + } + } + } + return ok; +} + +bool CheckHelper::CheckDistinguishableFinals(const Symbol &f1, + SourceName f1Name, const Symbol &f2, SourceName f2Name, + const Symbol &derivedType) { + const Procedure *p1{Characterize(f1)}; + const Procedure *p2{Characterize(f2)}; + if (p1 && p2) { + if (characteristics::Distinguishable(*p1, *p2)) { + return true; + } + if (auto *msg{messages_.Say(f1Name, + "FINAL subroutines '%s' and '%s' of derived type '%s' cannot be distinguished by rank or KIND type parameter value"_err_en_US, + f1Name, f2Name, derivedType.name())}) { + msg->Attach(f2Name, "FINAL declaration of '%s'"_en_US, f2.name()) + .Attach(f1.name(), "Definition of '%s'"_en_US, f1Name) + .Attach(f2.name(), "Definition of '%s'"_en_US, f2Name); + } + } + return false; } void CheckHelper::CheckHostAssoc( diff --git a/flang/lib/Semantics/mod-file.h b/flang/lib/Semantics/mod-file.h --- a/flang/lib/Semantics/mod-file.h +++ b/flang/lib/Semantics/mod-file.h @@ -53,7 +53,8 @@ void WriteOne(const Scope &); void Write(const Symbol &); std::string GetAsString(const Symbol &); - void PutSymbols(const Scope &); + // Returns true if a derived type with bindings and "contains" was emitted + bool PutSymbols(const Scope &); void PutSymbol(llvm::raw_ostream &, const Symbol &); void PutDerivedType(const Symbol &); void PutSubprogram(const Symbol &); diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp --- a/flang/lib/Semantics/mod-file.cpp +++ b/flang/lib/Semantics/mod-file.cpp @@ -177,7 +177,7 @@ } // Put out the visible symbols from scope. 
-void ModFileWriter::PutSymbols(const Scope &scope) { +bool ModFileWriter::PutSymbols(const Scope &scope) { std::string buf; llvm::raw_string_ostream typeBindings{ buf}; // stuff after CONTAINS in derived type @@ -187,6 +187,9 @@ if (auto str{typeBindings.str()}; !str.empty()) { CHECK(scope.IsDerivedType()); decls_ << "contains\n" << str; + return true; + } else { + return false; } } @@ -257,9 +260,6 @@ decls_ << "::/" << symbol.name() << "/\n"; } }, - [&](const FinalProcDetails &) { - typeBindings << "final::" << symbol.name() << '\n'; - }, [](const HostAssocDetails &) {}, [](const MiscDetails &) {}, [&](const auto &) { PutEntity(decls_, symbol); }, @@ -287,7 +287,17 @@ if (details.sequence()) { decls_ << "sequence\n"; } - PutSymbols(typeScope); + bool contains{PutSymbols(typeScope)}; + if (!details.finals().empty()) { + const char *sep{contains ? "final::" : "contains\nfinal::"}; + for (const auto &pair : details.finals()) { + decls_ << sep << pair.second->name(); + sep = ","; + } + if (*sep == ',') { + decls_ << '\n'; + } + } decls_ << "end type\n"; } diff --git a/flang/lib/Semantics/pointer-assignment.cpp b/flang/lib/Semantics/pointer-assignment.cpp --- a/flang/lib/Semantics/pointer-assignment.cpp +++ b/flang/lib/Semantics/pointer-assignment.cpp @@ -219,7 +219,7 @@ " derived type when target is unlimited polymorphic"_err_en_US; } } else { - if (!lhsType_->type().IsTypeCompatibleWith(rhsType->type())) { + if (!lhsType_->type().IsTkCompatibleWith(rhsType->type())) { msg = MessageFormattedText{ "Target type %s is not compatible with pointer type %s"_err_en_US, rhsType->type().AsFortran(), lhsType_->type().AsFortran()}; diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -4028,8 +4028,22 @@ } void DeclarationVisitor::Post(const parser::FinalProcedureStmt &x) { - for (auto &name : x.v) { - MakeTypeSymbol(name, FinalProcDetails{}); + if 
(currScope().IsDerivedType() && currScope().symbol()) { + if (auto *details{currScope().symbol()->detailsIf()}) { + for (const auto &subrName : x.v) { + if (const auto *name{ResolveName(subrName)}) { + auto pair{ + details->finals().emplace(name->source, DEREF(name->symbol))}; + if (!pair.second) { // C787 + Say(name->source, + "FINAL subroutine '%s' already appeared in this derived type"_err_en_US, + name->source) + .Attach(pair.first->first, + "earlier appearance of this FINAL subroutine"_en_US); + } + } + } + } } } diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp --- a/flang/lib/Semantics/symbol.cpp +++ b/flang/lib/Semantics/symbol.cpp @@ -228,7 +228,6 @@ [](const ProcBindingDetails &) { return "ProcBinding"; }, [](const NamelistDetails &) { return "Namelist"; }, [](const CommonBlockDetails &) { return "CommonBlockDetails"; }, - [](const FinalProcDetails &) { return "FinalProc"; }, [](const TypeParamDetails &) { return "TypeParam"; }, [](const MiscDetails &) { return "Misc"; }, [](const AssocEntityDetails &) { return "AssocEntity"; }, @@ -436,7 +435,6 @@ os << ' ' << object->name(); } }, - [&](const FinalProcDetails &) {}, [&](const TypeParamDetails &x) { DumpOptional(os, "type", x.type()); os << ' ' << common::EnumToString(x.attr()); diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -637,20 +637,23 @@ } bool IsFinalizable(const DerivedTypeSpec &derived) { - ScopeComponentIterator components{derived}; - return std::find_if(components.begin(), components.end(), - [](const Symbol &x) { return x.has(); }) != - components.end(); + if (!derived.typeSymbol().get().finals().empty()) { + return true; + } + DirectComponentIterator components{derived}; + return bool{std::find_if(components.begin(), components.end(), + [](const Symbol &component) { return IsFinalizable(component); })}; } -// TODO The following function returns true for all types 
with FINAL procedures -// This is because we don't yet fill in the data for FinalProcDetails bool HasImpureFinal(const DerivedTypeSpec &derived) { - ScopeComponentIterator components{derived}; - return std::find_if( - components.begin(), components.end(), [](const Symbol &x) { - return x.has() && !x.attrs().test(Attr::PURE); - }) != components.end(); + if (const auto *details{ + derived.typeSymbol().detailsIf()}) { + const auto &finals{details->finals()}; + return std::any_of(finals.begin(), finals.end(), + [](const auto &x) { return !x.second->attrs().test(Attr::PURE); }); + } else { + return false; + } } bool IsCoarray(const Symbol &symbol) { return symbol.Corank() > 0; } @@ -701,10 +704,12 @@ // C722 and C723: For a function to be assumed length, it must be external and // of CHARACTER type bool IsExternal(const Symbol &symbol) { - return (symbol.has() && symbol.owner().IsGlobal()) || - symbol.attrs().test(Attr::EXTERNAL); + return ClassifyProcedure(symbol) == ProcedureDefinitionClass::External; } +bool IsModuleProcedure(const Symbol &symbol) { + return ClassifyProcedure(symbol) == ProcedureDefinitionClass::Module; +} const Symbol *IsExternalInPureContext( const Symbol &symbol, const Scope &scope) { if (const auto *pureProc{FindPureProcedureContaining(scope)}) { @@ -1005,6 +1010,39 @@ return nullptr; } +ProcedureDefinitionClass ClassifyProcedure(const Symbol &symbol) { // 15.2.2 + const Symbol &ultimate{symbol.GetUltimate()}; + if (ultimate.attrs().test(Attr::INTRINSIC)) { + return ProcedureDefinitionClass::Intrinsic; + } else if (ultimate.attrs().test(Attr::EXTERNAL)) { + return ProcedureDefinitionClass::External; + } else if (const auto *procDetails{ultimate.detailsIf()}) { + if (procDetails->isDummy()) { + return ProcedureDefinitionClass::Dummy; + } else if (IsPointer(ultimate)) { + return ProcedureDefinitionClass::Pointer; + } + } else if (const Symbol * subp{FindSubprogram(symbol)}) { + if (const auto *subpDetails{subp->detailsIf()}) { + if 
(subpDetails->stmtFunction()) { + return ProcedureDefinitionClass::StatementFunction; + } + } + switch (ultimate.owner().kind()) { + case Scope::Kind::Global: + return ProcedureDefinitionClass::External; + case Scope::Kind::Module: + return ProcedureDefinitionClass::Module; + case Scope::Kind::MainProgram: + case Scope::Kind::Subprogram: + return ProcedureDefinitionClass::Internal; + default: + break; + } + } + return ProcedureDefinitionClass::None; +} + // ComponentIterator implementation template diff --git a/flang/runtime/descriptor-io.h b/flang/runtime/descriptor-io.h --- a/flang/runtime/descriptor-io.h +++ b/flang/runtime/descriptor-io.h @@ -159,13 +159,13 @@ } } } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedCharacterIO: subscripts out of bounds"); - } } else { return false; } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedCharacterIO: subscripts out of bounds"); + } } return true; } @@ -198,13 +198,13 @@ } } } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedLogicalIO: subscripts out of bounds"); - } } else { return false; } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedLogicalIO: subscripts out of bounds"); + } } return true; } diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp --- a/flang/runtime/edit-output.cpp +++ b/flang/runtime/edit-output.cpp @@ -330,17 +330,17 @@ template DataEdit RealOutputEditing::EditForGOutput(DataEdit edit) { edit.descriptor = 'E'; - if (!edit.width.has_value() || - (*edit.width > 0 && edit.digits.value_or(-1) == 0)) { + int significantDigits{ + edit.digits.value_or(BinaryFloatingPoint::decimalPrecision)}; // 'd' + if (!edit.width.has_value() || (*edit.width > 0 && significantDigits == 0)) { return edit; // Gw.0 -> 
Ew.0 for w > 0 } - decimal::ConversionToDecimalResult converted{Convert(1, edit)}; + decimal::ConversionToDecimalResult converted{ + Convert(significantDigits, edit)}; if (IsInfOrNaN(converted)) { return edit; } int expo{IsZero() ? 1 : converted.decimalExponent}; // 's' - int significantDigits{ - edit.digits.value_or(BinaryFloatingPoint::decimalPrecision)}; // 'd' if (expo < 0 || expo > significantDigits) { return edit; // Ew.d } diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h --- a/flang/runtime/format-implementation.h +++ b/flang/runtime/format-implementation.h @@ -97,7 +97,7 @@ } int result{0}; bool negate{ch == '-'}; - if (negate) { + if (negate || ch == '+') { firstCh = '\0'; ch = PeekNext(); } diff --git a/flang/runtime/io-api.h b/flang/runtime/io-api.h --- a/flang/runtime/io-api.h +++ b/flang/runtime/io-api.h @@ -231,10 +231,12 @@ // and avoid the following items when they might crash. bool IONAME(OutputDescriptor)(Cookie, const Descriptor &); bool IONAME(InputDescriptor)(Cookie, const Descriptor &); +// Contiguous transfers for unformatted I/O bool IONAME(OutputUnformattedBlock)( Cookie, const char *, std::size_t, std::size_t elementBytes); bool IONAME(InputUnformattedBlock)( Cookie, char *, std::size_t, std::size_t elementBytes); +// Formatted (including list directed) I/O data items bool IONAME(OutputInteger64)(Cookie, std::int64_t); bool IONAME(InputInteger)(Cookie, std::int64_t &, int kind = 8); bool IONAME(OutputReal32)(Cookie, float); @@ -245,7 +247,9 @@ bool IONAME(InputComplex32)(Cookie, float[2]); bool IONAME(OutputComplex64)(Cookie, double, double); bool IONAME(InputComplex64)(Cookie, double[2]); +bool IONAME(OutputCharacter)(Cookie, const char *, std::size_t, int kind = 1); bool IONAME(OutputAscii)(Cookie, const char *, std::size_t); +bool IONAME(InputCharacter)(Cookie, char *, std::size_t, int kind = 1); bool IONAME(InputAscii)(Cookie, char *, std::size_t); bool IONAME(OutputLogical)(Cookie, bool); bool 
IONAME(InputLogical)(Cookie, bool &); diff --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp --- a/flang/runtime/io-api.cpp +++ b/flang/runtime/io-api.cpp @@ -235,6 +235,7 @@ if (unit.access == Access::Sequential && !unit.isFixedRecordLength) { // Create space for (sub)record header to be completed by // UnformattedIoStatementState::EndIoStatement() + unit.recordLength.reset(); // in case of prior BACKSPACE io.Emit("\0\0\0\0", 4); // placeholder for record length header } } @@ -922,14 +923,16 @@ } bool IONAME(OutputInteger64)(Cookie cookie, std::int64_t n) { + cookie->CheckFormattedStmtType("OutputInteger64"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; descriptor.Establish( - TypeCategory::Integer, 8, reinterpret_cast(&n), 0); + TypeCategory::Integer, sizeof n, reinterpret_cast(&n), 0); return descr::DescriptorIO(*cookie, descriptor); } bool IONAME(InputInteger)(Cookie cookie, std::int64_t &n, int kind) { + cookie->CheckFormattedStmtType("InputInteger"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; descriptor.Establish( @@ -938,6 +941,7 @@ } bool IONAME(OutputReal32)(Cookie cookie, float x) { + cookie->CheckFormattedStmtType("OutputReal32"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; descriptor.Establish(TypeCategory::Real, 4, reinterpret_cast(&x), 0); @@ -945,6 +949,7 @@ } bool IONAME(OutputReal64)(Cookie cookie, double x) { + cookie->CheckFormattedStmtType("OutputReal64"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; descriptor.Establish(TypeCategory::Real, 8, reinterpret_cast(&x), 0); @@ -952,6 +957,7 @@ } bool IONAME(InputReal32)(Cookie cookie, float &x) { + cookie->CheckFormattedStmtType("InputReal32"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; descriptor.Establish(TypeCategory::Real, 4, reinterpret_cast(&x), 0); @@ -959,6 
+965,7 @@ } bool IONAME(InputReal64)(Cookie cookie, double &x) { + cookie->CheckFormattedStmtType("InputReal64"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; descriptor.Establish(TypeCategory::Real, 8, reinterpret_cast(&x), 0); @@ -966,6 +973,7 @@ } bool IONAME(OutputComplex32)(Cookie cookie, float r, float i) { + cookie->CheckFormattedStmtType("OutputComplex32"); float z[2]{r, i}; StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; @@ -975,6 +983,7 @@ } bool IONAME(OutputComplex64)(Cookie cookie, double r, double i) { + cookie->CheckFormattedStmtType("OutputComplex64"); double z[2]{r, i}; StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; @@ -984,6 +993,7 @@ } bool IONAME(InputComplex32)(Cookie cookie, float z[2]) { + cookie->CheckFormattedStmtType("InputComplex32"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; descriptor.Establish( @@ -992,6 +1002,7 @@ } bool IONAME(InputComplex64)(Cookie cookie, double z[2]) { + cookie->CheckFormattedStmtType("InputComplex64"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; descriptor.Establish( @@ -999,34 +1010,48 @@ return descr::DescriptorIO(*cookie, descriptor); } -bool IONAME(OutputAscii)(Cookie cookie, const char *x, std::size_t length) { +bool IONAME(OutputCharacter)( + Cookie cookie, const char *x, std::size_t length, int kind) { + cookie->CheckFormattedStmtType("OutputCharacter"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; descriptor.Establish( - 1, length, reinterpret_cast(const_cast(x)), 0); + kind, length, reinterpret_cast(const_cast(x)), 0); return descr::DescriptorIO(*cookie, descriptor); } -bool IONAME(InputAscii)(Cookie cookie, char *x, std::size_t length) { +bool IONAME(OutputAscii)(Cookie cookie, const char *x, std::size_t length) { + return 
IONAME(OutputCharacter(cookie, x, length, 1)); +} + +bool IONAME(InputCharacter)( + Cookie cookie, char *x, std::size_t length, int kind) { + cookie->CheckFormattedStmtType("InputCharacter"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; - descriptor.Establish(1, length, reinterpret_cast(x), 0); + descriptor.Establish(kind, length, reinterpret_cast(x), 0); return descr::DescriptorIO(*cookie, descriptor); } +bool IONAME(InputAscii)(Cookie cookie, char *x, std::size_t length) { + return IONAME(InputCharacter(cookie, x, length, 1)); +} + bool IONAME(OutputLogical)(Cookie cookie, bool truth) { + cookie->CheckFormattedStmtType("OutputLogical"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; descriptor.Establish( - TypeCategory::Logical, 1, reinterpret_cast(&truth), 0); + TypeCategory::Logical, sizeof truth, reinterpret_cast(&truth), 0); return descr::DescriptorIO(*cookie, descriptor); } bool IONAME(InputLogical)(Cookie cookie, bool &truth) { + cookie->CheckFormattedStmtType("InputLogical"); StaticDescriptor staticDescriptor; Descriptor &descriptor{staticDescriptor.descriptor()}; descriptor.Establish( - TypeCategory::Logical, 1, reinterpret_cast(&truth), 0); + TypeCategory::Logical, sizeof truth, reinterpret_cast(&truth), 0); return descr::DescriptorIO(*cookie, descriptor); } diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h --- a/flang/runtime/io-stmt.h +++ b/flang/runtime/io-stmt.h @@ -43,6 +43,13 @@ template class ExternalListIoStatementState; template class UnformattedIoStatementState; +struct InputStatementState {}; +struct OutputStatementState {}; +template +using IoDirectionState = std::conditional_t; +struct FormattedIoStatementState {}; + // The Cookie type in the I/O API is a pointer (for C) to this class. 
class IoStatementState { public: @@ -65,6 +72,7 @@ ExternalFileUnit *GetExternalFileUnit() const; // null if internal unit MutableModes &mutableModes(); void BeginReadingRecord(); + void FinishReadingRecord(); bool Inquire(InquiryKeywordHash, char *, std::size_t); bool Inquire(InquiryKeywordHash, bool &); bool Inquire(InquiryKeywordHash, std::int64_t, bool &); // PENDING= @@ -89,6 +97,15 @@ std::optional NextInField(std::optional &remaining); std::optional GetNextNonBlank(); // can advance record + template void CheckFormattedStmtType(const char *name) { + if (!get_if() || + !get_if>()) { + GetIoErrorHandler().Crash( + "%s called for I/O statement that is not formatted %s", name, + D == Direction::Output ? "output" : "input"); + } + } + private: std::variant, std::reference_wrapper, @@ -123,7 +140,7 @@ std::optional GetNextDataEdit(IoStatementState &, int = 1); ExternalFileUnit *GetExternalFileUnit() const { return nullptr; } void BeginReadingRecord() {} - + void FinishReadingRecord() {} bool Inquire(InquiryKeywordHash, char *, std::size_t); bool Inquire(InquiryKeywordHash, bool &); bool Inquire(InquiryKeywordHash, std::int64_t, bool &); @@ -131,17 +148,11 @@ void BadInquiryKeywordHashCrash(InquiryKeywordHash); }; -struct InputStatementState {}; -struct OutputStatementState {}; -template -using IoDirectionState = std::conditional_t; - -struct FormattedStatementState {}; - // Common state for list-directed internal & external I/O -template struct ListDirectedStatementState {}; -template <> struct ListDirectedStatementState { +template struct ListDirectedStatementState; +template <> +struct ListDirectedStatementState + : public FormattedIoStatementState { static std::size_t RemainingSpaceInRecord(const ConnectionState &); bool NeedAdvance(const ConnectionState &, std::size_t) const; bool EmitLeadingSpaceOrAdvance( @@ -150,7 +161,9 @@ IoStatementState &, int maxRepeat = 1); bool lastWasUndelimitedCharacter{false}; }; -template <> class ListDirectedStatementState { 
+template <> +class ListDirectedStatementState + : public FormattedIoStatementState { public: // Skips value separators, handles repetition and null values. // Vacant when '/' appears; present with descriptor == ListDirectedNullValue @@ -198,7 +211,7 @@ template class InternalFormattedIoStatementState : public InternalIoStatementState, - public FormattedStatementState { + public FormattedIoStatementState { public: using CharType = CHAR; using typename InternalIoStatementState::Buffer; @@ -269,14 +282,12 @@ void HandleRelativePosition(std::int64_t); void HandleAbsolutePosition(std::int64_t); void BeginReadingRecord(); - -private: - bool beganReading_{false}; + void FinishReadingRecord(); }; template class ExternalFormattedIoStatementState : public ExternalIoStatementState, - public FormattedStatementState { + public FormattedIoStatementState { public: using CharType = CHAR; ExternalFormattedIoStatementState(ExternalFileUnit &, const CharType *format, diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp --- a/flang/runtime/io-stmt.cpp +++ b/flang/runtime/io-stmt.cpp @@ -236,11 +236,13 @@ template int ExternalIoStatementState::EndIoStatement() { if constexpr (DIR == Direction::Input) { BeginReadingRecord(); // in case of READ with no data items - } - if (!unit().nonAdvancing && GetIoStat() != IostatEnd) { - unit().AdvanceRecord(*this); - } - if constexpr (DIR == Direction::Output) { + if (!unit().nonAdvancing) { + FinishReadingRecord(); + } + } else { + if (!unit().nonAdvancing) { + unit().AdvanceRecord(*this); + } unit().FlushIfTerminal(*this); } return ExternalIoStatementBase::EndIoStatement(); @@ -315,10 +317,20 @@ template void ExternalIoStatementState::BeginReadingRecord() { if constexpr (DIR == Direction::Input) { - if (!beganReading_) { - beganReading_ = true; - unit().BeginReadingRecord(*this); - } + unit().BeginReadingRecord(*this); + } else { + Crash("ExternalIoStatementState::BeginReadingRecord() " + "called"); + } +} + +template +void 
ExternalIoStatementState::FinishReadingRecord() { + if constexpr (DIR == Direction::Input) { + unit().FinishReadingRecord(*this); + } else { + Crash("ExternalIoStatementState::FinishReadingRecord() " + "called"); } } @@ -808,7 +820,10 @@ } break; case HashInquiryKeyword("DIRECT"): - str = unit().mayPosition() ? "YES" : "NO"; + str = unit().access == Access::Direct || + (unit().mayPosition() && unit().isFixedRecordLength) + ? "YES" + : "NO"; break; case HashInquiryKeyword("ENCODING"): str = unit().isUnformatted ? "UNDEFINED" @@ -819,7 +834,7 @@ str = unit().isUnformatted ? "UNFORMATTED" : "FORMATTED"; break; case HashInquiryKeyword("FORMATTED"): - str = "YES"; + str = !unit().isUnformatted ? "YES" : "NO"; break; case HashInquiryKeyword("NAME"): str = unit().path(); @@ -875,7 +890,9 @@ } break; case HashInquiryKeyword("SEQUENTIAL"): - str = "YES"; + // "NO" for Direct, since Sequential would not work if + // the unit were reopened without RECL=. + str = unit().access == Access::Sequential ? "YES" : "NO"; break; case HashInquiryKeyword("SIGN"): str = unit().isUnformatted ? "UNDEFINED" @@ -883,13 +900,13 @@ : "SUPPRESS"; break; case HashInquiryKeyword("STREAM"): - str = "YES"; + str = unit().access == Access::Stream ? "YES" : "NO"; break; case HashInquiryKeyword("WRITE"): str = unit().mayWrite() ? "YES" : "NO"; break; case HashInquiryKeyword("UNFORMATTED"): - str = "YES"; + str = unit().isUnformatted ? "YES" : "NO"; break; } if (str) { @@ -1078,6 +1095,10 @@ break; case HashInquiryKeyword("DIRECT"): case HashInquiryKeyword("ENCODING"): + case HashInquiryKeyword("FORMATTED"): + case HashInquiryKeyword("SEQUENTIAL"): + case HashInquiryKeyword("STREAM"): + case HashInquiryKeyword("UNFORMATTED"): str = "UNKNONN"; break; case HashInquiryKeyword("READ"): @@ -1089,12 +1110,6 @@ case HashInquiryKeyword("WRITE"): str = MayWrite(path_.get()) ? 
"YES" : "NO"; break; - case HashInquiryKeyword("FORMATTED"): - case HashInquiryKeyword("SEQUENTIAL"): - case HashInquiryKeyword("STREAM"): - case HashInquiryKeyword("UNFORMATTED"): - str = "YES"; - break; case HashInquiryKeyword("NAME"): str = path_.get(); return true; diff --git a/flang/runtime/type-code.cpp b/flang/runtime/type-code.cpp --- a/flang/runtime/type-code.cpp +++ b/flang/runtime/type-code.cpp @@ -78,13 +78,13 @@ raw_ = CFI_type_Bool; break; case 2: - raw_ = CFI_type_int16_t; + raw_ = CFI_type_int_fast16_t; break; case 4: - raw_ = CFI_type_int32_t; + raw_ = CFI_type_int_fast32_t; break; case 8: - raw_ = CFI_type_int64_t; + raw_ = CFI_type_int_fast64_t; break; } break; diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h --- a/flang/runtime/unit.h +++ b/flang/runtime/unit.h @@ -78,6 +78,7 @@ std::optional GetCurrentChar(IoErrorHandler &); void SetLeftTabLimit(); void BeginReadingRecord(IoErrorHandler &); + void FinishReadingRecord(IoErrorHandler &); bool AdvanceRecord(IoErrorHandler &); void BackspaceRecord(IoErrorHandler &); void FlushIfTerminal(IoErrorHandler &); @@ -105,6 +106,7 @@ int unitNumber_{-1}; Direction direction_{Direction::Output}; bool impliedEndfile_{false}; // seq. 
output has taken place + bool beganReadingRecord_{false}; Lock lock_; diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp --- a/flang/runtime/unit.cpp +++ b/flang/runtime/unit.cpp @@ -132,16 +132,17 @@ static_cast(*totalBytes)); } } + endfileRecordNumber.reset(); + currentRecordNumber = 1; + if (totalBytes && recordLength && *recordLength) { + endfileRecordNumber = 1 + (*totalBytes / *recordLength); + } if (position == Position::Append) { - if (totalBytes && recordLength && *recordLength) { - endfileRecordNumber = 1 + (*totalBytes / *recordLength); - } else { + if (!endfileRecordNumber) { // Fake it so that we can backspace relative from the end - endfileRecordNumber = std::numeric_limits::max() - 1; + endfileRecordNumber = std::numeric_limits::max() - 2; } currentRecordNumber = *endfileRecordNumber; - } else { - currentRecordNumber = 1; } } @@ -290,7 +291,7 @@ furthestPositionInRecord = furthestAfter; return true; } else { - handler.SignalEnd(); + // EOF or error: can be handled & has been signaled endfileRecordNumber = currentRecordNumber; return false; } @@ -349,6 +350,10 @@ void ExternalFileUnit::BeginReadingRecord(IoErrorHandler &handler) { RUNTIME_CHECK(handler, direction_ == Direction::Input); + if (beganReadingRecord_) { + return; + } + beganReadingRecord_ = true; if (access == Access::Sequential) { if (endfileRecordNumber && currentRecordNumber >= *endfileRecordNumber) { handler.SignalEnd(); @@ -367,28 +372,39 @@ } } +void ExternalFileUnit::FinishReadingRecord(IoErrorHandler &handler) { + RUNTIME_CHECK(handler, direction_ == Direction::Input && beganReadingRecord_); + beganReadingRecord_ = false; + if (handler.GetIoStat() != IostatOk) { + // avoid bogus crashes in END/ERR circumstances + } else if (access == Access::Sequential) { + RUNTIME_CHECK(handler, recordLength.has_value()); + if (isFixedRecordLength) { + frameOffsetInFile_ += recordOffsetInFrame_ + *recordLength; + recordOffsetInFrame_ = 0; + } else if (isUnformatted) { + // Retain footer 
in frame for more efficient BACKSPACE + frameOffsetInFile_ += recordOffsetInFrame_ + *recordLength; + recordOffsetInFrame_ = sizeof(std::uint32_t); + recordLength.reset(); + } else { // formatted + if (Frame()[recordOffsetInFrame_ + *recordLength] == '\r') { + ++recordOffsetInFrame_; + } + recordOffsetInFrame_ += *recordLength + 1; + RUNTIME_CHECK(handler, Frame()[recordOffsetInFrame_ - 1] == '\n'); + recordLength.reset(); + } + } + ++currentRecordNumber; + BeginRecord(); +} + bool ExternalFileUnit::AdvanceRecord(IoErrorHandler &handler) { bool ok{true}; if (direction_ == Direction::Input) { - if (access == Access::Sequential) { - RUNTIME_CHECK(handler, recordLength.has_value()); - if (isFixedRecordLength) { - frameOffsetInFile_ += recordOffsetInFrame_ + *recordLength; - recordOffsetInFrame_ = 0; - } else if (isUnformatted) { - // Retain footer in frame for more efficient BACKSPACE - frameOffsetInFile_ += recordOffsetInFrame_ + *recordLength; - recordOffsetInFrame_ = sizeof(std::uint32_t); - recordLength.reset(); - } else { // formatted - if (Frame()[recordOffsetInFrame_ + *recordLength] == '\r') { - ++recordOffsetInFrame_; - } - recordOffsetInFrame_ += *recordLength + 1; - RUNTIME_CHECK(handler, Frame()[recordOffsetInFrame_ - 1] == '\n'); - recordLength.reset(); - } - } + FinishReadingRecord(handler); + BeginReadingRecord(handler); } else { // Direction::Output if (!isUnformatted) { if (isFixedRecordLength && recordLength) { @@ -406,9 +422,9 @@ recordOffsetInFrame_ + recordLength.value_or(furthestPositionInRecord); recordOffsetInFrame_ = 0; impliedEndfile_ = true; + ++currentRecordNumber; + BeginRecord(); } - ++currentRecordNumber; - BeginRecord(); return ok; } @@ -417,16 +433,22 @@ handler.SignalError(IostatBackspaceNonSequential, "BACKSPACE(UNIT=%d) on non-sequential file", unitNumber()); } else { - DoImpliedEndfile(handler); - --currentRecordNumber; - BeginRecord(); - if (isFixedRecordLength) { - BackspaceFixedRecord(handler); - } else if (isUnformatted) { - 
BackspaceVariableUnformattedRecord(handler); + if (endfileRecordNumber && currentRecordNumber > *endfileRecordNumber) { + // BACKSPACE after ENDFILE } else { - BackspaceVariableFormattedRecord(handler); + DoImpliedEndfile(handler); + if (frameOffsetInFile_ + recordOffsetInFrame_ > 0) { + --currentRecordNumber; + if (isFixedRecordLength) { + BackspaceFixedRecord(handler); + } else if (isUnformatted) { + BackspaceVariableUnformattedRecord(handler); + } else { + BackspaceVariableFormattedRecord(handler); + } + } } + BeginRecord(); } } @@ -443,8 +465,12 @@ } else if (!mayWrite()) { handler.SignalError(IostatEndfileUnwritable, "ENDFILE(UNIT=%d) on read-only file", unitNumber()); + } else if (endfileRecordNumber && + currentRecordNumber > *endfileRecordNumber) { + // ENDFILE after ENDFILE } else { DoEndfile(handler); + ++currentRecordNumber; } } @@ -456,7 +482,6 @@ DoImpliedEndfile(handler); SetPosition(0); currentRecordNumber = 1; - // TODO: reset endfileRecordNumber? } } diff --git a/flang/test/Parser/acc-unparse.f90 b/flang/test/Parser/acc-unparse.f90 new file mode 100644 --- /dev/null +++ b/flang/test/Parser/acc-unparse.f90 @@ -0,0 +1,19 @@ +! RUN: %f18 -fopenacc -funparse %s | FileCheck %s + +! Test unparse does not crash with OpenACC directives. + +! Test bug 47659 +program bug47659 + integer :: i, j + label1: do i = 1, 10 + !$acc parallel loop + do j = 1, 10 + if (j == 2) then + exit label1 + end if + end do + end do label1 +end program + +!CHECK-LABEL: PROGRAM bug47659 +!CHECK: !$ACC PARALLEL LOOP diff --git a/flang/test/Semantics/call03.f90 b/flang/test/Semantics/call03.f90 --- a/flang/test/Semantics/call03.f90 +++ b/flang/test/Semantics/call03.f90 @@ -29,7 +29,7 @@ class(tbp), intent(in) :: this end subroutine subroutine subr02(this) - class(final), intent(in) :: this + type(final), intent(inout) :: this end subroutine subroutine poly(x) @@ -113,7 +113,7 @@ subroutine test05 ! 
15.5.2.4(2) type(final) :: x - !ERROR: Actual argument associated with TYPE(*) dummy argument 'x=' may not have FINAL subroutine 'subr02' + !ERROR: Actual argument associated with TYPE(*) dummy argument 'x=' may not have derived type 'final' with FINAL subroutine 'subr02' call typestar(x) end subroutine diff --git a/flang/test/Semantics/call05.f90 b/flang/test/Semantics/call05.f90 --- a/flang/test/Semantics/call05.f90 +++ b/flang/test/Semantics/call05.f90 @@ -89,9 +89,9 @@ call spp(up) !ERROR: Actual argument type 'CLASS(*)' is not compatible with dummy argument type 't' call spa(ua) - !ERROR: POINTER or ALLOCATABLE dummy and actual arguments must have the same declared type + !ERROR: POINTER or ALLOCATABLE dummy and actual arguments must have the same declared type and kind call spp(pp2) - !ERROR: POINTER or ALLOCATABLE dummy and actual arguments must have the same declared type + !ERROR: POINTER or ALLOCATABLE dummy and actual arguments must have the same declared type and kind call spa(pa2) !ERROR: Rank of dummy argument is 1, but actual argument has rank 2 call smp(mpmat) diff --git a/flang/test/Semantics/final01.f90 b/flang/test/Semantics/final01.f90 new file mode 100644 --- /dev/null +++ b/flang/test/Semantics/final01.f90 @@ -0,0 +1,119 @@ +! RUN: %S/test_errors.sh %s %t %f18 +! 
Test FINAL subroutine constraints C786-C789 +module m1 + external :: external + intrinsic :: sin + real :: object + procedure(valid), pointer :: pointer + type :: parent(kind1, len1) + integer, kind :: kind1 = 1 + integer, len :: len1 = 1 + end type + type, extends(parent) :: child(kind2, len2) + integer, kind :: kind2 = 2 + integer, len :: len2 = 2 + contains + final :: valid +!ERROR: FINAL subroutine 'external' of derived type 'child' must be a module procedure +!ERROR: FINAL subroutine 'sin' of derived type 'child' must be a module procedure +!ERROR: FINAL subroutine 'object' of derived type 'child' must be a module procedure +!ERROR: FINAL subroutine 'pointer' of derived type 'child' must be a module procedure +!ERROR: FINAL subroutine 'func' of derived type 'child' must be a subroutine + final :: external, sin, object, pointer, func +!ERROR: FINAL subroutine 's01' of derived type 'child' must have a single dummy argument that is a data object +!ERROR: FINAL subroutine 's02' of derived type 'child' must have a single dummy argument that is a data object +!ERROR: FINAL subroutine 's03' of derived type 'child' must not have a dummy argument with INTENT(OUT) +!ERROR: FINAL subroutine 's04' of derived type 'child' must not have a dummy argument with the VALUE attribute +!ERROR: FINAL subroutine 's05' of derived type 'child' must not have a POINTER dummy argument +!ERROR: FINAL subroutine 's06' of derived type 'child' must not have an ALLOCATABLE dummy argument +!ERROR: FINAL subroutine 's07' of derived type 'child' must not have a coarray dummy argument +!ERROR: FINAL subroutine 's08' of derived type 'child' must not have a polymorphic dummy argument +!ERROR: FINAL subroutine 's09' of derived type 'child' must not have a polymorphic dummy argument +!ERROR: FINAL subroutine 's10' of derived type 'child' must not have an OPTIONAL dummy argument + final :: s01, s02, s03, s04, s05, s06, s07, s08, s09, s10 +!ERROR: FINAL subroutine 's11' of derived type 'child' must 
have a single dummy argument +!ERROR: FINAL subroutine 's12' of derived type 'child' must have a single dummy argument +!ERROR: FINAL subroutine 's13' of derived type 'child' must have a dummy argument with an assumed LEN type parameter 'len1=*' +!ERROR: FINAL subroutine 's13' of derived type 'child' must have a dummy argument with an assumed LEN type parameter 'len2=*' +!ERROR: FINAL subroutine 's14' of derived type 'child' must have a dummy argument with an assumed LEN type parameter 'len2=*' +!ERROR: FINAL subroutine 's15' of derived type 'child' must have a dummy argument with an assumed LEN type parameter 'len1=*' +!ERROR: FINAL subroutine 's16' of derived type 'child' must not have a polymorphic dummy argument +!ERROR: FINAL subroutine 's17' of derived type 'child' must have a TYPE(child) dummy argument + final :: s11, s12, s13, s14, s15, s16, s17 +!ERROR: FINAL subroutine 'valid' already appeared in this derived type + final :: valid +!ERROR: FINAL subroutines 'valid2' and 'valid' of derived type 'child' cannot be distinguished by rank or KIND type parameter value + final :: valid2 + end type + contains + subroutine valid(x) + type(child(len1=*, len2=*)), intent(inout) :: x + end subroutine + subroutine valid2(x) + type(child(len1=*, len2=*)), intent(inout) :: x + end subroutine + real function func(x) + type(child(len1=*, len2=*)), intent(inout) :: x + func = 0. 
+ end function + subroutine s01(*) + end subroutine + subroutine s02(x) + external :: x + end subroutine + subroutine s03(x) + type(child(kind1=3, len1=*, len2=*)), intent(out) :: x + end subroutine + subroutine s04(x) + type(child(kind1=4, len1=*, len2=*)), value :: x + end subroutine + subroutine s05(x) + type(child(kind1=5, len1=*, len2=*)), pointer :: x + end subroutine + subroutine s06(x) + type(child(kind1=6, len1=*, len2=*)), allocatable :: x + end subroutine + subroutine s07(x) + type(child(kind1=7, len1=*, len2=*)) :: x[*] + end subroutine + subroutine s08(x) + class(child(kind1=8, len1=*, len2=*)) :: x + end subroutine + subroutine s09(x) + class(*) :: x + end subroutine + subroutine s10(x) + type(child(kind1=10, len1=*, len2=*)), optional :: x + end subroutine + subroutine s11(x, y) + type(child(kind1=11, len1=*, len2=*)) :: x, y + end subroutine + subroutine s12 + end subroutine + subroutine s13(x) + type(child(kind1=13)) :: x + end subroutine + subroutine s14(x) + type(child(kind1=14, len1=*,len2=2)) :: x + end subroutine + subroutine s15(x) + type(child(kind1=15, len2=*)) :: x + end subroutine + subroutine s16(x) + type(*) :: x + end subroutine + subroutine s17(x) + type(parent(kind1=17, len1=*)) :: x + end subroutine + subroutine nested + type :: t + contains +!ERROR: FINAL subroutine 'internal' of derived type 't' must be a module procedure + final :: internal + end type + contains + subroutine internal(x) + type(t), intent(inout) :: x + end subroutine + end subroutine +end module diff --git a/flang/test/Semantics/modfile10.f90 b/flang/test/Semantics/modfile10.f90 --- a/flang/test/Semantics/modfile10.f90 +++ b/flang/test/Semantics/modfile10.f90 @@ -64,8 +64,8 @@ ! type::t2 ! integer(4)::x ! contains -! final::c ! procedure,non_overridable,private::d +! final::c ! end type ! type,abstract::t2a ! 
contains diff --git a/flang/test/Semantics/resolve32.f90 b/flang/test/Semantics/resolve32.f90 --- a/flang/test/Semantics/resolve32.f90 +++ b/flang/test/Semantics/resolve32.f90 @@ -57,7 +57,7 @@ contains procedure, nopass :: b => s final :: f - !ERROR: Type parameter, component, or procedure binding 'i' already defined in this type + !ERROR: FINAL subroutine 'i' of derived type 't2' must be a module procedure final :: i end type type t3 diff --git a/flang/test/Semantics/resolve55.f90 b/flang/test/Semantics/resolve55.f90 --- a/flang/test/Semantics/resolve55.f90 +++ b/flang/test/Semantics/resolve55.f90 @@ -36,25 +36,24 @@ end do end subroutine s4 -subroutine s5() +module m ! Cannot have a variable of a finalizable type in a locality spec type t1 integer :: i contains final :: f end type t1 - - type(t1) :: var - -!ERROR: Finalizable variable 'var' not allowed in a locality-spec - do concurrent(i=1:5) local(var) - end do - -contains + contains + subroutine s5() + type(t1) :: var + !ERROR: Finalizable variable 'var' not allowed in a locality-spec + do concurrent(i=1:5) local(var) + end do + end subroutine s5 subroutine f(x) type(t1) :: x end subroutine f -end subroutine s5 +end module m subroutine s6 ! 
Cannot have a nonpointer polymorphic dummy argument in a locality spec diff --git a/flang/unittests/Runtime/hello.cpp b/flang/unittests/Runtime/hello.cpp --- a/flang/unittests/Runtime/hello.cpp +++ b/flang/unittests/Runtime/hello.cpp @@ -118,6 +118,41 @@ } } +static void descrOutputTest() { + char buffer[9]; + // Formatted + const char *format{"(2A4)"}; + auto cookie{IONAME(BeginInternalFormattedOutput)( + buffer, sizeof buffer, format, std::strlen(format))}; + StaticDescriptor<1> staticDescriptor; + Descriptor &desc{staticDescriptor.descriptor()}; + SubscriptValue extent[]{2}; + char data[2][4]; + std::memcpy(data[0], "ABCD", 4); + std::memcpy(data[1], "EFGH", 4); + desc.Establish(TypeCode{CFI_type_char}, sizeof data[0], &data, 1, extent); + desc.Dump(); + desc.Check(); + IONAME(OutputDescriptor)(cookie, desc); + if (auto status{IONAME(EndIoStatement)(cookie)}) { + Fail() << "descrOutputTest: '" << format << "' failed, status " + << static_cast(status) << '\n'; + } else { + test("descrOutputTest(formatted)", "ABCDEFGH ", + std::string{buffer, sizeof buffer}); + } + // List-directed + cookie = IONAME(BeginInternalListOutput)(buffer, sizeof buffer); + IONAME(OutputDescriptor)(cookie, desc); + if (auto status{IONAME(EndIoStatement)(cookie)}) { + Fail() << "descrOutputTest: list-directed failed, status " + << static_cast(status) << '\n'; + } else { + test("descrOutputTest(list)", " ABCDEFGH", + std::string{buffer, sizeof buffer}); + } +} + static void realTest(const char *format, double x, const char *expect) { char buffer[800]; auto cookie{IONAME(BeginInternalFormattedOutput)( @@ -485,6 +520,7 @@ realInTest("(DC,F18.0)", " 12,5", 0x4029000000000000); listInputTest(); + descrOutputTest(); return EndTests(); } diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -202,14 +202,6 @@ add_custom_target(cxx-generated-config ALL DEPENDS ${LIBCXX_BINARY_DIR}/__generated_config) -# 
In some build configurations (like bootstrapping clang), we need to be able to -# install the libcxx headers before the CMake configuration for libcxx runs. Making -# the name of this target configurable allows LLVM/runtimes/CMakeLists.txt to -# add this subdirectory to the LLVM build to put libcxx's headers in place -# before libcxx's build configuration is run. -if (NOT CXX_HEADER_TARGET) - set(CXX_HEADER_TARGET cxx-headers) -endif() if(LIBCXX_HEADER_DIR) set(output_dir ${LIBCXX_HEADER_DIR}/include/c++/v1) @@ -234,23 +226,23 @@ list(APPEND out_files ${dst}) add_custom_target(generate-cxx-headers DEPENDS ${out_files}) - add_library(${CXX_HEADER_TARGET} INTERFACE) - add_dependencies(${CXX_HEADER_TARGET} generate-cxx-headers ${LIBCXX_CXX_ABI_HEADER_TARGET}) + add_library(cxx-headers INTERFACE) + add_dependencies(cxx-headers generate-cxx-headers ${LIBCXX_CXX_ABI_HEADER_TARGET}) # TODO: Use target_include_directories once we figure out why that breaks the runtimes build if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" OR "${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC") - target_compile_options(${CXX_HEADER_TARGET} INTERFACE /I "${output_dir}") + target_compile_options(cxx-headers INTERFACE /I "${output_dir}") else() - target_compile_options(${CXX_HEADER_TARGET} INTERFACE -I "${output_dir}") + target_compile_options(cxx-headers INTERFACE -I "${output_dir}") endif() # Make sure the generated __config_site header is included when we build the library. 
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" OR "${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC") - target_compile_options(${CXX_HEADER_TARGET} INTERFACE /FI "${LIBCXX_BINARY_DIR}/__config_site") + target_compile_options(cxx-headers INTERFACE /FI "${LIBCXX_BINARY_DIR}/__config_site") else() - target_compile_options(${CXX_HEADER_TARGET} INTERFACE -include "${LIBCXX_BINARY_DIR}/__config_site") + target_compile_options(cxx-headers INTERFACE -include "${LIBCXX_BINARY_DIR}/__config_site") endif() else() - add_library(${CXX_HEADER_TARGET} INTERFACE) + add_library(cxx-headers INTERFACE) endif() if (LIBCXX_INSTALL_HEADERS) @@ -258,7 +250,7 @@ get_filename_component(dir ${file} DIRECTORY) install(FILES ${file} DESTINATION ${LIBCXX_INSTALL_HEADER_PREFIX}include/c++/v1/${dir} - COMPONENT ${CXX_HEADER_TARGET} + COMPONENT cxx-headers PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ ) endforeach() @@ -268,15 +260,15 @@ DESTINATION ${LIBCXX_INSTALL_HEADER_PREFIX}include/c++/v1 PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ RENAME __config - COMPONENT ${CXX_HEADER_TARGET}) + COMPONENT cxx-headers) if (NOT CMAKE_CONFIGURATION_TYPES) - add_custom_target(install-${CXX_HEADER_TARGET} - DEPENDS ${CXX_HEADER_TARGET} cxx-generated-config + add_custom_target(install-cxx-headers + DEPENDS cxx-headers cxx-generated-config COMMAND "${CMAKE_COMMAND}" - -DCMAKE_INSTALL_COMPONENT=${CXX_HEADER_TARGET} + -DCMAKE_INSTALL_COMPONENT=cxx-headers -P "${CMAKE_BINARY_DIR}/cmake_install.cmake") # Stripping is a no-op for headers - add_custom_target(install-${CXX_HEADER_TARGET}-stripped DEPENDS install-${CXX_HEADER_TARGET}) + add_custom_target(install-cxx-headers-stripped DEPENDS install-cxx-headers) endif() endif() diff --git a/libcxx/lib/abi/CHANGELOG.TXT b/libcxx/lib/abi/CHANGELOG.TXT --- a/libcxx/lib/abi/CHANGELOG.TXT +++ b/libcxx/lib/abi/CHANGELOG.TXT @@ -12,6 +12,20 @@ New entries should be added directly below the "Version" header. 
+------------ +Version 12.0 +------------ + +* 4f13b9992971 - [libc++] Simplify how we re-export symbols from libc++abi + + We re-export some symbols that were exported from libc++abi but not from + libc++. Exporting new symbols is not an ABI break. + + x86_64-apple-apple-darwin + ------------------------- + Symbol added: ___cxa_allocate_dependent_exception + Symbol added: ___cxa_free_dependent_exception + ------------ Version 10.0 ------------ diff --git a/libcxx/lib/abi/CMakeLists.txt b/libcxx/lib/abi/CMakeLists.txt --- a/libcxx/lib/abi/CMakeLists.txt +++ b/libcxx/lib/abi/CMakeLists.txt @@ -22,7 +22,8 @@ AND ("${LIBCXX_CXX_ABI_LIBNAME}" STREQUAL "libcxxabi" OR (APPLE AND "${LIBCXX_CXX_ABI_LIBNAME}" STREQUAL "default")) AND NOT LIBCXX_ABI_UNSTABLE - AND LIBCXX_ENABLE_EXCEPTIONS) + AND LIBCXX_ENABLE_EXCEPTIONS + AND LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS) add_custom_target(check-cxx-abilist ${SYMDIFF_EXE} --only-stdlib-symbols --strict ${ABILIST_FILE} $ diff --git a/libcxx/lib/abi/x86_64-apple-darwin.v1.abilist b/libcxx/lib/abi/x86_64-apple-darwin.v1.abilist --- a/libcxx/lib/abi/x86_64-apple-darwin.v1.abilist +++ b/libcxx/lib/abi/x86_64-apple-darwin.v1.abilist @@ -2431,3 +2431,7 @@ {'type': 'FUNC', 'is_defined': True, 'name': '__ZNSt3__131__arrive_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseEh'} {'type': 'FUNC', 'is_defined': True, 'name': '__ZNSt3__132__destroy_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseE'} {'type': 'FUNC', 'is_defined': True, 'name': '__ZNSt3__134__construct_barrier_algorithm_baseERl'} +{'type': 'U', 'is_defined': False, 'name': '___cxa_allocate_dependent_exception'} +{'type': 'U', 'is_defined': False, 'name': '___cxa_free_dependent_exception'} +{'type': 'I', 'is_defined': True, 'name': '___cxa_allocate_dependent_exception'} +{'type': 'I', 'is_defined': True, 'name': '___cxa_free_dependent_exception'} diff --git a/libcxx/lib/libc++abi-exceptions.exp b/libcxx/lib/libc++abi-exceptions.exp deleted file mode 100644 --- 
a/libcxx/lib/libc++abi-exceptions.exp +++ /dev/null @@ -1,10 +0,0 @@ -___cxa_allocate_exception -___cxa_begin_catch -___cxa_call_unexpected -___cxa_current_exception_type -___cxa_end_catch -___cxa_free_exception -___cxa_get_exception_ptr -___cxa_rethrow -___cxa_throw -___gxx_personality_v0 diff --git a/libcxx/lib/libc++abi-exceptions.sjlj.exp b/libcxx/lib/libc++abi-exceptions.sjlj.exp deleted file mode 100644 --- a/libcxx/lib/libc++abi-exceptions.sjlj.exp +++ /dev/null @@ -1,10 +0,0 @@ -___cxa_allocate_exception -___cxa_begin_catch -___cxa_call_unexpected -___cxa_current_exception_type -___cxa_end_catch -___cxa_free_exception -___cxa_get_exception_ptr -___cxa_rethrow -___cxa_throw -___gxx_personality_sj0 diff --git a/libcxx/lib/libc++abi-new-delete.exp b/libcxx/lib/libc++abi-new-delete.exp deleted file mode 100644 --- a/libcxx/lib/libc++abi-new-delete.exp +++ /dev/null @@ -1,20 +0,0 @@ -__ZdaPv -__ZdaPvRKSt9nothrow_t -__ZdaPvSt11align_val_t -__ZdaPvSt11align_val_tRKSt9nothrow_t -__ZdaPvm -__ZdaPvmSt11align_val_t -__ZdlPv -__ZdlPvRKSt9nothrow_t -__ZdlPvSt11align_val_t -__ZdlPvSt11align_val_tRKSt9nothrow_t -__ZdlPvm -__ZdlPvmSt11align_val_t -__Znam -__ZnamRKSt9nothrow_t -__ZnamSt11align_val_t -__ZnamSt11align_val_tRKSt9nothrow_t -__Znwm -__ZnwmRKSt9nothrow_t -__ZnwmSt11align_val_t -__ZnwmSt11align_val_tRKSt9nothrow_t diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -213,16 +213,9 @@ "-Wl,-force_symbols_not_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/notweak.exp" "-Wl,-force_symbols_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/weak.exp") - if (LIBCXX_ENABLE_EXCEPTIONS) - if ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "^(armv6|armv7|armv7s)$") - target_link_libraries(cxx_shared PRIVATE "-Wl,-reexported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/libc++abi-exceptions.sjlj.exp") - else() - target_link_libraries(cxx_shared PRIVATE 
"-Wl,-reexported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/libc++abi-exceptions.exp") - endif() - endif() - if (NOT LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS) - target_link_libraries(cxx_shared PRIVATE "-Wl,-reexported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/libc++abi-new-delete.exp") + target_link_libraries(cxx_shared PRIVATE + "-Wl,-reexported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../../libcxxabi/lib/new-delete.exp") endif() endif() diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml --- a/libcxx/utils/ci/buildkite-pipeline.yml +++ b/libcxx/utils/ci/buildkite-pipeline.yml @@ -16,61 +16,76 @@ steps: - label: "C++03" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-ubuntu-cxx03 | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-cxx03 | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" - label: "C++11" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-ubuntu-cxx11 | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-cxx11 | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" - label: "C++14" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-ubuntu-cxx14 | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-cxx14 | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" - label: "C++17" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-ubuntu-cxx17 | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-cxx17 | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" - label: "C++20" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-ubuntu-cxx2a | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && 
libcxx/utils/ci/run-buildbot.sh generic-cxx2a | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" - label: "-fno-exceptions" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-ubuntu-noexceptions | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-noexceptions | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" - label: "GCC/C++20" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-ubuntu-gcc | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-gcc | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" - label: "ASAN" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-ubuntu-asan | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-asan | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" - label: "TSAN" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-ubuntu-tsan | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-tsan | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" - label: "UBSAN" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-ubuntu-ubsan | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-ubsan | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" - label: "With LLVM's libunwind" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-ubuntu-with_llvm_unwinder | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-with_llvm_unwinder | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" - label: "Single-threaded" - command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh 
x86_64-ubuntu-singlethreaded | libcxx/utils/ci/phabricator-report" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-singlethreaded | libcxx/utils/ci/phabricator-report" agents: queue: "libcxx-builders" + + - label: "MacOS C++20" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh generic-cxx2a | libcxx/utils/ci/phabricator-report" + agents: + queue: "libcxx-macos-builders" + + # Build with the configuration we use to generate libc++.dylib on Apple platforms + - label: "Apple system" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-apple-system | libcxx/utils/ci/phabricator-report" + agents: + queue: "libcxx-macos-builders" + - label: "Apple system -fno-exceptions" + command: "set -o pipefail && libcxx/utils/ci/run-buildbot.sh x86_64-apple-system-noexceptions | libcxx/utils/ci/phabricator-report" + agents: + queue: "libcxx-macos-builders" diff --git a/libcxx/utils/ci/run-buildbot.sh b/libcxx/utils/ci/run-buildbot.sh --- a/libcxx/utils/ci/run-buildbot.sh +++ b/libcxx/utils/ci/run-buildbot.sh @@ -10,89 +10,91 @@ set -ex BUILDER="${1}" +MONOREPO_ROOT="$(git rev-parse --show-toplevel)" +BUILD_DIR="${MONOREPO_ROOT}/build/${BUILDER}" args=() args+=("-DLLVM_ENABLE_PROJECTS=libcxx;libunwind;libcxxabi") args+=("-DLIBCXX_CXX_ABI=libcxxabi") case "${BUILDER}" in -x86_64-ubuntu-cxx03) +generic-cxx03) export CC=clang export CXX=clang++ args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported --param=std=c++03") ;; -x86_64-ubuntu-cxx11) +generic-cxx11) export CC=clang export CXX=clang++ args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported --param=std=c++11") ;; -x86_64-ubuntu-cxx14) +generic-cxx14) export CC=clang export CXX=clang++ args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported --param=std=c++14") ;; -x86_64-ubuntu-cxx17) +generic-cxx17) export CC=clang export CXX=clang++ args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported --param=std=c++17") ;; -x86_64-ubuntu-cxx2a) +generic-cxx2a) export CC=clang export CXX=clang++ args+=("-DLLVM_LIT_ARGS=-sv 
--show-unsupported --param=std=c++2a") ;; -x86_64-ubuntu-noexceptions) +generic-noexceptions) export CC=clang export CXX=clang++ args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported") args+=("-DLIBCXX_ENABLE_EXCEPTIONS=OFF") args+=("-DLIBCXXABI_ENABLE_EXCEPTIONS=OFF") ;; -x86_64-ubuntu-32bit) +generic-32bit) export CC=clang export CXX=clang++ args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported") args+=("-DLLVM_BUILD_32_BITS=ON") ;; -x86_64-ubuntu-gcc) +generic-gcc) export CC=gcc export CXX=g++ # FIXME: Re-enable experimental testing on GCC. GCC cares about the order # in which we link -lc++experimental, which causes issues. args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported --param enable_experimental=False") ;; -x86_64-ubuntu-asan) +generic-asan) export CC=clang export CXX=clang++ args+=("-DLLVM_USE_SANITIZER=Address") args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported") ;; -x86_64-ubuntu-msan) +generic-msan) export CC=clang export CXX=clang++ args+=("-DLLVM_USE_SANITIZER=MemoryWithOrigins") args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported") ;; -x86_64-ubuntu-tsan) +generic-tsan) export CC=clang export CXX=clang++ args+=("-DLLVM_USE_SANITIZER=Thread") args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported") ;; -x86_64-ubuntu-ubsan) +generic-ubsan) export CC=clang export CXX=clang++ args+=("-DLLVM_USE_SANITIZER=Undefined") args+=("-DLIBCXX_ABI_UNSTABLE=ON") args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported") ;; -x86_64-ubuntu-with_llvm_unwinder) +generic-with_llvm_unwinder) export CC=clang export CXX=clang++ args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported") args+=("-DLIBCXXABI_USE_LLVM_UNWINDER=ON") ;; -x86_64-ubuntu-singlethreaded) +generic-singlethreaded) export CC=clang export CXX=clang++ args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported") @@ -100,19 +102,29 @@ args+=("-DLIBCXXABI_ENABLE_THREADS=OFF") args+=("-DLIBCXX_ENABLE_MONOTONIC_CLOCK=OFF") ;; +x86_64-apple-system) + export CC=clang + export CXX=clang++ + args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported") + 
args+=("-C${MONOREPO_ROOT}/libcxx/cmake/caches/Apple.cmake") +;; +x86_64-apple-system-noexceptions) + export CC=clang + export CXX=clang++ + args+=("-DLLVM_LIT_ARGS=-sv --show-unsupported") + args+=("-C${MONOREPO_ROOT}/libcxx/cmake/caches/Apple.cmake") + args+=("-DLIBCXX_ENABLE_EXCEPTIONS=OFF") + args+=("-DLIBCXXABI_ENABLE_EXCEPTIONS=OFF") +;; *) echo "${BUILDER} is not a known configuration" exit 1 ;; esac -UMBRELLA_ROOT="$(git rev-parse --show-toplevel)" -LLVM_ROOT="${UMBRELLA_ROOT}/llvm" -BUILD_DIR="${UMBRELLA_ROOT}/build/${BUILDER}" - echo "--- Generating CMake" rm -rf "${BUILD_DIR}" -cmake -S "${LLVM_ROOT}" -B "${BUILD_DIR}" -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo "${args[@]}" +cmake -S "${MONOREPO_ROOT}/llvm" -B "${BUILD_DIR}" -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo "${args[@]}" echo "--- Building libc++ and libc++abi" ninja -C "${BUILD_DIR}" check-cxx-deps cxxabi diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt --- a/libcxxabi/src/CMakeLists.txt +++ b/libcxxabi/src/CMakeLists.txt @@ -204,19 +204,27 @@ # -exported_symbols_list is only available on Apple platforms if (APPLE) - target_link_libraries(cxxabi_shared PRIVATE "-Wl,-exported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-base.exp") + function(export_symbols file) + target_link_libraries(cxxabi_shared PRIVATE "-Wl,-exported_symbols_list,${file}") + endfunction() + function(reexport_symbols file) + export_symbols("${file}") + target_link_libraries(cxxabi_shared INTERFACE "-Wl,-reexported_symbols_list,${file}") + endfunction() + + export_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-base.exp") if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS) - target_link_libraries(cxxabi_shared PRIVATE "-Wl,-exported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/new-delete.exp") + export_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/new-delete.exp") endif() if (LIBCXXABI_ENABLE_EXCEPTIONS) - target_link_libraries(cxxabi_shared PRIVATE 
"-Wl,-exported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/exceptions.exp") + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/exceptions.exp") if ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "^(armv6|armv7|armv7s)$") - target_link_libraries(cxxabi_shared PRIVATE "-Wl,-exported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-sjlj.exp") + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-sjlj.exp") else() - target_link_libraries(cxxabi_shared PRIVATE "-Wl,-exported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp") + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp") endif() endif() endif() diff --git a/lld/COFF/DebugTypes.h b/lld/COFF/DebugTypes.h --- a/lld/COFF/DebugTypes.h +++ b/lld/COFF/DebugTypes.h @@ -10,32 +10,37 @@ #define LLD_COFF_DEBUGTYPES_H #include "lld/Common/LLVM.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" namespace llvm { namespace codeview { -class PrecompRecord; -class TypeServer2Record; +struct GloballyHashedType; } // namespace codeview namespace pdb { class NativeSession; +class TpiStream; } } // namespace llvm namespace lld { namespace coff { +using llvm::codeview::GloballyHashedType; using llvm::codeview::TypeIndex; class ObjFile; class PDBInputFile; class TypeMerger; +struct GHashState; class TpiSource { public: - enum TpiKind { Regular, PCH, UsingPCH, PDB, PDBIpi, UsingPDB }; + enum TpiKind : uint8_t { Regular, PCH, UsingPCH, PDB, PDBIpi, UsingPDB }; TpiSource(TpiKind k, ObjFile *f); virtual ~TpiSource(); @@ -53,21 +58,97 @@ /// caller-provided ObjectIndexMap. virtual Error mergeDebugT(TypeMerger *m); + /// Load global hashes, either by hashing types directly, or by loading them + /// from LLVM's .debug$H section. 
+ virtual void loadGHashes(); + + /// Use global hashes to merge type information. + virtual void remapTpiWithGHashes(GHashState *g); + + // Remap a type index in place. + bool remapTypeIndex(TypeIndex &ti, llvm::codeview::TiRefKind refKind) const; + +protected: + void remapRecord(MutableArrayRef rec, + ArrayRef typeRefs); + + void mergeTypeRecord(TypeIndex curIndex, llvm::codeview::CVType ty); + + // Merge the type records listed in uniqueTypes. beginIndex is the TypeIndex + // of the first record in this source, typically 0x1000. When PCHs are + // involved, it may start higher. + void mergeUniqueTypeRecords( + ArrayRef debugTypes, + TypeIndex beginIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex)); + + // Use the ghash table to construct a map from source type index to + // destination PDB type index. Usable for either TPI or IPI. + void fillMapFromGHashes(GHashState *m, + llvm::SmallVectorImpl &indexMap); + + // Copies ghashes from a vector into an array. These are long lived, so it's + // worth the time to copy these into an appropriately sized vector to reduce + // memory usage. + void assignGHashesFromVector(std::vector &&hashVec); + + // Walk over file->debugTypes and fill in the isItemIndex bit vector. + void fillIsItemIndexFromDebugT(); + +public: + bool remapTypesInSymbolRecord(MutableArrayRef rec); + + void remapTypesInTypeRecord(MutableArrayRef rec); + /// Is this a dependent file that needs to be processed first, before other /// OBJs? virtual bool isDependency() const { return false; } - static void forEachSource(llvm::function_ref fn); + /// Returns true if this type record should be omitted from the PDB, even if + /// it is unique. This prevents a record from being added to the input ghash + /// table. + bool shouldOmitFromPdb(uint32_t ghashIdx) { + return ghashIdx == endPrecompGHashIdx; + } + + /// All sources of type information in the program. 
+ static std::vector instances; + + /// Dependency type sources, such as type servers or PCH object files. These + /// must be processed before objects that rely on them. Set by + /// TpiSources::sortDependencies. + static ArrayRef dependencySources; + + /// Object file sources. These must be processed after dependencySources. + static ArrayRef objectSources; + + /// Sorts the dependencies and reassigns TpiSource indices. + static void sortDependencies(); static uint32_t countTypeServerPDBs(); static uint32_t countPrecompObjs(); + /// Free heap allocated ghashes. + static void clearGHashes(); + /// Clear global data structures for TpiSources. static void clear(); const TpiKind kind; + bool ownedGHashes = true; + uint32_t tpiSrcIdx = 0; + +protected: + /// The ghash index (zero based, not 0x1000-based) of the LF_ENDPRECOMP record + /// in this object, if one exists. This is the all ones value otherwise. It is + /// recorded here so that it can be omitted from the final ghash table. + uint32_t endPrecompGHashIdx = ~0U; + +public: ObjFile *file; + /// An error encountered during type merging, if any. + Error typeMergingError = Error::success(); + // Storage for tpiMap or ipiMap, depending on the kind of source. llvm::SmallVector indexMapStorage; @@ -76,6 +157,31 @@ // objects. llvm::ArrayRef tpiMap; llvm::ArrayRef ipiMap; + + /// Array of global type hashes, indexed by TypeIndex. May be calculated on + /// demand, or present in input object files. + llvm::ArrayRef ghashes; + + /// When ghashing is used, record the mapping from LF_[M]FUNC_ID to function + /// type index here. Both indices are PDB indices, not object type indexes. + std::vector> funcIdToType; + + /// Indicates if a type record is an item index or a type index. + llvm::BitVector isItemIndex; + + /// A list of all "unique" type indices which must be merged into the final + /// PDB. GHash type deduplication produces this list, and it should be + /// considerably smaller than the input. 
+ std::vector uniqueTypes; + + struct MergedInfo { + std::vector recs; + std::vector recSizes; + std::vector recHashes; + }; + + MergedInfo mergedTpi; + MergedInfo mergedIpi; }; TpiSource *makeTpiSource(ObjFile *file); diff --git a/lld/COFF/DebugTypes.cpp b/lld/COFF/DebugTypes.cpp --- a/lld/COFF/DebugTypes.cpp +++ b/lld/COFF/DebugTypes.cpp @@ -10,9 +10,12 @@ #include "Chunks.h" #include "Driver.h" #include "InputFiles.h" +#include "PDB.h" #include "TypeMerger.h" #include "lld/Common/ErrorHandler.h" #include "lld/Common/Memory.h" +#include "lld/Common/Timer.h" +#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeRecordHelpers.h" #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h" @@ -20,7 +23,10 @@ #include "llvm/DebugInfo/PDB/Native/InfoStream.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" +#include "llvm/DebugInfo/PDB/Native/TpiHashing.h" #include "llvm/DebugInfo/PDB/Native/TpiStream.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/Parallel.h" #include "llvm/Support/Path.h" using namespace llvm; @@ -54,6 +60,10 @@ } Error mergeDebugT(TypeMerger *m) override; + + void loadGHashes() override; + void remapTpiWithGHashes(GHashState *g) override; + bool isDependency() const override { return true; } PDBInputFile *pdbInputFile = nullptr; @@ -73,22 +83,29 @@ friend class TypeServerSource; - // IPI merging is handled in TypeServerSource::mergeDebugT, since it depends - // directly on type merging. + // All of the TpiSource methods are no-ops. The parent TypeServerSource + // handles both TPI and IPI. 
Error mergeDebugT(TypeMerger *m) override { return Error::success(); } - + void loadGHashes() override {} + void remapTpiWithGHashes(GHashState *g) override {} bool isDependency() const override { return true; } }; // This class represents the debug type stream of an OBJ file that depends on a // PDB type server (see TypeServerSource). class UseTypeServerSource : public TpiSource { + Expected getTypeServerSource(); + public: UseTypeServerSource(ObjFile *f, TypeServer2Record ts) : TpiSource(UsingPDB, f), typeServerDependency(ts) {} Error mergeDebugT(TypeMerger *m) override; + // No need to load ghashes from /Zi objects. + void loadGHashes() override {} + void remapTpiWithGHashes(GHashState *g) override; + // Information about the PDB type server dependency, that needs to be loaded // in before merging this OBJ. TypeServer2Record typeServerDependency; @@ -110,6 +127,8 @@ toString(it.first->second->file) + " and " + toString(file) + ")"); } + void loadGHashes() override; + bool isDependency() const override { return true; } static std::map mappings; @@ -124,20 +143,49 @@ Error mergeDebugT(TypeMerger *m) override; + void loadGHashes() override; + void remapTpiWithGHashes(GHashState *g) override; + +private: + Error mergeInPrecompHeaderObj(); + +public: // Information about the Precomp OBJ dependency, that needs to be loaded in // before merging this OBJ. PrecompRecord precompDependency; }; } // namespace -static std::vector gc; +std::vector TpiSource::instances; +ArrayRef TpiSource::dependencySources; +ArrayRef TpiSource::objectSources; -TpiSource::TpiSource(TpiKind k, ObjFile *f) : kind(k), file(f) { - gc.push_back(this); +TpiSource::TpiSource(TpiKind k, ObjFile *f) + : kind(k), tpiSrcIdx(instances.size()), file(f) { + instances.push_back(this); } // Vtable key method. -TpiSource::~TpiSource() = default; +TpiSource::~TpiSource() { + // Silence any assertions about unchecked errors. 
+ consumeError(std::move(typeMergingError)); +} + +void TpiSource::sortDependencies() { + // Order dependencies first, but preserve the existing order. + std::vector deps; + std::vector objs; + for (TpiSource *s : instances) + (s->isDependency() ? deps : objs).push_back(s); + uint32_t numDeps = deps.size(); + uint32_t numObjs = objs.size(); + instances = std::move(deps); + instances.insert(instances.end(), objs.begin(), objs.end()); + for (uint32_t i = 0, e = instances.size(); i < e; ++i) + instances[i]->tpiSrcIdx = i; + dependencySources = makeArrayRef(instances.data(), numDeps); + objectSources = makeArrayRef(instances.data() + numDeps, numObjs); +} TpiSource *lld::coff::makeTpiSource(ObjFile *file) { return make(TpiSource::Regular, file); @@ -165,14 +213,68 @@ return make(file, precomp); } -void TpiSource::forEachSource(llvm::function_ref fn) { - for_each(gc, fn); -} - std::map TypeServerSource::mappings; std::map PrecompSource::mappings; +bool TpiSource::remapTypeIndex(TypeIndex &ti, TiRefKind refKind) const { + if (ti.isSimple()) + return true; + + // This can be an item index or a type index. Choose the appropriate map. + ArrayRef tpiOrIpiMap = + (refKind == TiRefKind::IndexRef) ? ipiMap : tpiMap; + if (ti.toArrayIndex() >= tpiOrIpiMap.size()) + return false; + ti = tpiOrIpiMap[ti.toArrayIndex()]; + return true; +} + +void TpiSource::remapRecord(MutableArrayRef rec, + ArrayRef typeRefs) { + MutableArrayRef contents = rec.drop_front(sizeof(RecordPrefix)); + for (const TiReference &ref : typeRefs) { + unsigned byteSize = ref.Count * sizeof(TypeIndex); + if (contents.size() < ref.Offset + byteSize) + fatal("symbol record too short"); + + MutableArrayRef indices( + reinterpret_cast(contents.data() + ref.Offset), ref.Count); + for (TypeIndex &ti : indices) { + if (!remapTypeIndex(ti, ref.Kind)) { + if (config->verbose) { + uint16_t kind = + reinterpret_cast(rec.data())->RecordKind; + StringRef fname = file ? 
file->getName() : ""; + log("failed to remap type index in record of kind 0x" + + utohexstr(kind) + " in " + fname + " with bad " + + (ref.Kind == TiRefKind::IndexRef ? "item" : "type") + + " index 0x" + utohexstr(ti.getIndex())); + } + ti = TypeIndex(SimpleTypeKind::NotTranslated); + continue; + } + } + } +} + +void TpiSource::remapTypesInTypeRecord(MutableArrayRef rec) { + // TODO: Handle errors similar to symbols. + SmallVector typeRefs; + discoverTypeIndices(CVType(rec), typeRefs); + remapRecord(rec, typeRefs); +} + +bool TpiSource::remapTypesInSymbolRecord(MutableArrayRef rec) { + // Discover type index references in the record. Skip it if we don't + // know where they are. + SmallVector typeRefs; + if (!discoverTypeIndicesInSymbol(rec, typeRefs)) + return false; + remapRecord(rec, typeRefs); + return true; +} + // A COFF .debug$H section is currently a clang extension. This function checks // if a .debug$H section is in a format that we expect / understand, so that we // can ignore any sections which are coincidentally also named .debug$H but do @@ -203,7 +305,6 @@ static ArrayRef getHashesFromDebugH(ArrayRef debugH) { assert(canUseDebugH(debugH)); - debugH = debugH.drop_front(sizeof(object::debug_h_header)); uint32_t count = debugH.size() / sizeof(GloballyHashedType); return {reinterpret_cast(debugH.data()), count}; @@ -211,32 +312,17 @@ // Merge .debug$T for a generic object file. 
Error TpiSource::mergeDebugT(TypeMerger *m) { + assert(!config->debugGHashes && + "use remapTpiWithGHashes when ghash is enabled"); + CVTypeArray types; BinaryStreamReader reader(file->debugTypes, support::little); cantFail(reader.readArray(types, reader.getLength())); - if (config->debugGHashes) { - ArrayRef hashes; - std::vector ownedHashes; - if (Optional> debugH = getDebugH(file)) - hashes = getHashesFromDebugH(*debugH); - else { - ownedHashes = GloballyHashedType::hashTypes(types); - hashes = ownedHashes; - } - - if (auto err = mergeTypeAndIdRecords(m->globalIDTable, m->globalTypeTable, - indexMapStorage, types, hashes, - file->pchSignature)) - fatal("codeview::mergeTypeAndIdRecords failed: " + - toString(std::move(err))); - } else { - if (auto err = - mergeTypeAndIdRecords(m->idTable, m->typeTable, indexMapStorage, - types, file->pchSignature)) - fatal("codeview::mergeTypeAndIdRecords failed: " + - toString(std::move(err))); - } + if (auto err = mergeTypeAndIdRecords( + m->idTable, m->typeTable, indexMapStorage, types, file->pchSignature)) + fatal("codeview::mergeTypeAndIdRecords failed: " + + toString(std::move(err))); // In an object, there is only one mapping for both types and items. tpiMap = indexMapStorage; @@ -267,6 +353,9 @@ // Merge types from a type server PDB. Error TypeServerSource::mergeDebugT(TypeMerger *m) { + assert(!config->debugGHashes && + "use remapTpiWithGHashes when ghash is enabled"); + pdb::PDBFile &pdbFile = pdbInputFile->session->getPDBFile(); Expected expectedTpi = pdbFile.getPDBTpiStream(); if (auto e = expectedTpi.takeError()) @@ -279,45 +368,18 @@ maybeIpi = &*expectedIpi; } - if (config->debugGHashes) { - // PDBs do not actually store global hashes, so when merging a type server - // PDB we have to synthesize global hashes. To do this, we first synthesize - // global hashes for the TPI stream, since it is independent, then we - // synthesize hashes for the IPI stream, using the hashes for the TPI stream - // as inputs. 
- auto tpiHashes = GloballyHashedType::hashTypes(expectedTpi->typeArray()); - Optional endPrecomp; - // Merge TPI first, because the IPI stream will reference type indices. - if (auto err = - mergeTypeRecords(m->globalTypeTable, indexMapStorage, - expectedTpi->typeArray(), tpiHashes, endPrecomp)) - fatal("codeview::mergeTypeRecords failed: " + toString(std::move(err))); - tpiMap = indexMapStorage; - - // Merge IPI. - if (maybeIpi) { - auto ipiHashes = - GloballyHashedType::hashIds(maybeIpi->typeArray(), tpiHashes); - if (auto err = - mergeIdRecords(m->globalIDTable, tpiMap, ipiSrc->indexMapStorage, - maybeIpi->typeArray(), ipiHashes)) - fatal("codeview::mergeIdRecords failed: " + toString(std::move(err))); - ipiMap = ipiSrc->indexMapStorage; - } - } else { - // Merge TPI first, because the IPI stream will reference type indices. - if (auto err = mergeTypeRecords(m->typeTable, indexMapStorage, - expectedTpi->typeArray())) - fatal("codeview::mergeTypeRecords failed: " + toString(std::move(err))); - tpiMap = indexMapStorage; - - // Merge IPI. - if (maybeIpi) { - if (auto err = mergeIdRecords(m->idTable, tpiMap, ipiSrc->indexMapStorage, - maybeIpi->typeArray())) - fatal("codeview::mergeIdRecords failed: " + toString(std::move(err))); - ipiMap = ipiSrc->indexMapStorage; - } + // Merge TPI first, because the IPI stream will reference type indices. + if (auto err = mergeTypeRecords(m->typeTable, indexMapStorage, + expectedTpi->typeArray())) + fatal("codeview::mergeTypeRecords failed: " + toString(std::move(err))); + tpiMap = indexMapStorage; + + // Merge IPI. 
+ if (maybeIpi) { + if (auto err = mergeIdRecords(m->idTable, tpiMap, ipiSrc->indexMapStorage, + maybeIpi->typeArray())) + fatal("codeview::mergeIdRecords failed: " + toString(std::move(err))); + ipiMap = ipiSrc->indexMapStorage; } if (config->showSummary) { @@ -337,7 +399,7 @@ return Error::success(); } -Error UseTypeServerSource::mergeDebugT(TypeMerger *m) { +Expected UseTypeServerSource::getTypeServerSource() { const codeview::GUID &tsId = typeServerDependency.getGuid(); StringRef tsPath = typeServerDependency.getName(); @@ -357,8 +419,15 @@ tsSrc = (TypeServerSource *)pdb->debugTypesObj; } + return tsSrc; +} - pdb::PDBFile &pdbSession = tsSrc->pdbInputFile->session->getPDBFile(); +Error UseTypeServerSource::mergeDebugT(TypeMerger *m) { + Expected tsSrc = getTypeServerSource(); + if (!tsSrc) + return tsSrc.takeError(); + + pdb::PDBFile &pdbSession = (*tsSrc)->pdbInputFile->session->getPDBFile(); auto expectedInfo = pdbSession.getPDBInfoStream(); if (!expectedInfo) return expectedInfo.takeError(); @@ -368,12 +437,12 @@ // must match the GUID specified in the TypeServer2 record. if (expectedInfo->getGuid() != typeServerDependency.getGuid()) return createFileError( - tsPath, + typeServerDependency.getName(), make_error(pdb::pdb_error_code::signature_out_of_date)); // Reuse the type index map of the type server. - tpiMap = tsSrc->tpiMap; - ipiMap = tsSrc->ipiMap; + tpiMap = (*tsSrc)->tpiMap; + ipiMap = (*tsSrc)->ipiMap; return Error::success(); } @@ -399,26 +468,28 @@ return nullptr; } -static Expected findPrecompMap(ObjFile *file, - PrecompRecord &pr) { +static PrecompSource *findPrecompSource(ObjFile *file, PrecompRecord &pr) { // Cross-compile warning: given that Clang doesn't generate LF_PRECOMP // records, we assume the OBJ comes from a Windows build of cl.exe. Thusly, // the paths embedded in the OBJs are in the Windows format. 
SmallString<128> prFileName = sys::path::filename(pr.getPrecompFilePath(), sys::path::Style::windows); - PrecompSource *precomp; auto it = PrecompSource::mappings.find(pr.getSignature()); if (it != PrecompSource::mappings.end()) { - precomp = it->second; - } else { - // Lookup by name - precomp = findObjByName(prFileName); + return it->second; } + // Lookup by name + return findObjByName(prFileName); +} + +static Expected findPrecompMap(ObjFile *file, + PrecompRecord &pr) { + PrecompSource *precomp = findPrecompSource(file, pr); if (!precomp) return createFileError( - prFileName, + pr.getPrecompFilePath(), make_error(pdb::pdb_error_code::no_matching_pch)); if (pr.getSignature() != file->pchSignature) @@ -437,11 +508,8 @@ /// Merges a precompiled headers TPI map into the current TPI map. The /// precompiled headers object will also be loaded and remapped in the /// process. -static Error -mergeInPrecompHeaderObj(ObjFile *file, - SmallVectorImpl &indexMapStorage, - PrecompRecord &precomp) { - auto e = findPrecompMap(file, precomp); +Error UsePrecompSource::mergeInPrecompHeaderObj() { + auto e = findPrecompMap(file, precompDependency); if (!e) return e.takeError(); @@ -449,11 +517,14 @@ if (precompSrc->tpiMap.empty()) return Error::success(); - assert(precomp.getStartTypeIndex() == TypeIndex::FirstNonSimpleIndex); - assert(precomp.getTypesCount() <= precompSrc->tpiMap.size()); + assert(precompDependency.getStartTypeIndex() == + TypeIndex::FirstNonSimpleIndex); + assert(precompDependency.getTypesCount() <= precompSrc->tpiMap.size()); // Use the previously remapped index map from the precompiled headers. indexMapStorage.append(precompSrc->tpiMap.begin(), - precompSrc->tpiMap.begin() + precomp.getTypesCount()); + precompSrc->tpiMap.begin() + + precompDependency.getTypesCount()); + return Error::success(); } @@ -462,8 +533,7 @@ // precompiled headers object (/Yc) first. 
Some type indices in the current // object are referencing data in the precompiled headers object, so we need // both to be loaded. - if (Error e = - mergeInPrecompHeaderObj(file, indexMapStorage, precompDependency)) + if (Error e = mergeInPrecompHeaderObj()) return e; return TpiSource::mergeDebugT(m); @@ -478,7 +548,593 @@ } void TpiSource::clear() { - gc.clear(); + // Clean up any owned ghash allocations. + clearGHashes(); + TpiSource::instances.clear(); TypeServerSource::mappings.clear(); PrecompSource::mappings.clear(); } + +//===----------------------------------------------------------------------===// +// Parellel GHash type merging implementation. +//===----------------------------------------------------------------------===// + +void TpiSource::loadGHashes() { + if (Optional> debugH = getDebugH(file)) { + ghashes = getHashesFromDebugH(*debugH); + ownedGHashes = false; + } else { + CVTypeArray types; + BinaryStreamReader reader(file->debugTypes, support::little); + cantFail(reader.readArray(types, reader.getLength())); + assignGHashesFromVector(GloballyHashedType::hashTypes(types)); + } + + fillIsItemIndexFromDebugT(); +} + +// Copies ghashes from a vector into an array. These are long lived, so it's +// worth the time to copy these into an appropriately sized vector to reduce +// memory usage. +void TpiSource::assignGHashesFromVector( + std::vector &&hashVec) { + GloballyHashedType *hashes = new GloballyHashedType[hashVec.size()]; + memcpy(hashes, hashVec.data(), hashVec.size() * sizeof(GloballyHashedType)); + ghashes = makeArrayRef(hashes, hashVec.size()); + ownedGHashes = true; +} + +// Faster way to iterate type records. forEachTypeChecked is faster than +// iterating CVTypeArray. It avoids virtual readBytes calls in inner loops. 
+static void forEachTypeChecked(ArrayRef types, + function_ref fn) { + checkError( + forEachCodeViewRecord(types, [fn](const CVType &ty) -> Error { + fn(ty); + return Error::success(); + })); +} + +// Walk over file->debugTypes and fill in the isItemIndex bit vector. +// TODO: Store this information in .debug$H so that we don't have to recompute +// it. This is the main bottleneck slowing down parallel ghashing with one +// thread over single-threaded ghashing. +void TpiSource::fillIsItemIndexFromDebugT() { + uint32_t index = 0; + isItemIndex.resize(ghashes.size()); + forEachTypeChecked(file->debugTypes, [&](const CVType &ty) { + if (isIdRecord(ty.kind())) + isItemIndex.set(index); + ++index; + }); +} + +void TpiSource::mergeTypeRecord(TypeIndex curIndex, CVType ty) { + // Decide if the merged type goes into TPI or IPI. + bool isItem = isIdRecord(ty.kind()); + MergedInfo &merged = isItem ? mergedIpi : mergedTpi; + + // Copy the type into our mutable buffer. + assert(ty.length() <= codeview::MaxRecordLength); + size_t offset = merged.recs.size(); + size_t newSize = alignTo(ty.length(), 4); + merged.recs.resize(offset + newSize); + auto newRec = makeMutableArrayRef(&merged.recs[offset], newSize); + memcpy(newRec.data(), ty.data().data(), newSize); + + // Fix up the record prefix and padding bytes if it required resizing. + if (newSize != ty.length()) { + reinterpret_cast(newRec.data())->RecordLen = newSize - 2; + for (size_t i = ty.length(); i < newSize; ++i) + newRec[i] = LF_PAD0 + (newSize - i); + } + + // Remap the type indices in the new record. + remapTypesInTypeRecord(newRec); + uint32_t pdbHash = check(pdb::hashTypeRecord(CVType(newRec))); + merged.recSizes.push_back(static_cast(newSize)); + merged.recHashes.push_back(pdbHash); + + // Retain a mapping from PDB function id to PDB function type. This mapping is + // used during symbol procesing to rewrite S_GPROC32_ID symbols to S_GPROC32 + // symbols. 
+ if (ty.kind() == LF_FUNC_ID || ty.kind() == LF_MFUNC_ID) { + bool success = ty.length() >= 12; + TypeIndex funcId = curIndex; + if (success) + success &= remapTypeIndex(funcId, TiRefKind::IndexRef); + TypeIndex funcType = + *reinterpret_cast(&newRec.data()[8]); + if (success) { + funcIdToType.push_back({funcId, funcType}); + } else { + StringRef fname = file ? file->getName() : ""; + warn("corrupt LF_[M]FUNC_ID record 0x" + utohexstr(curIndex.getIndex()) + + " in " + fname); + } + } +} + +void TpiSource::mergeUniqueTypeRecords(ArrayRef typeRecords, + TypeIndex beginIndex) { + // Re-sort the list of unique types by index. + if (kind == PDB) + assert(std::is_sorted(uniqueTypes.begin(), uniqueTypes.end())); + else + llvm::sort(uniqueTypes); + + // Accumulate all the unique types into one buffer in mergedTypes. + uint32_t ghashIndex = 0; + auto nextUniqueIndex = uniqueTypes.begin(); + assert(mergedTpi.recs.empty()); + assert(mergedIpi.recs.empty()); + forEachTypeChecked(typeRecords, [&](const CVType &ty) { + if (nextUniqueIndex != uniqueTypes.end() && + *nextUniqueIndex == ghashIndex) { + mergeTypeRecord(beginIndex + ghashIndex, ty); + ++nextUniqueIndex; + } + ++ghashIndex; + }); + assert(nextUniqueIndex == uniqueTypes.end() && + "failed to merge all desired records"); + assert(uniqueTypes.size() == + mergedTpi.recSizes.size() + mergedIpi.recSizes.size() && + "missing desired record"); +} + +void TpiSource::remapTpiWithGHashes(GHashState *g) { + assert(config->debugGHashes && "ghashes must be enabled"); + fillMapFromGHashes(g, indexMapStorage); + tpiMap = indexMapStorage; + ipiMap = indexMapStorage; + mergeUniqueTypeRecords(file->debugTypes); + // TODO: Free all unneeded ghash resources now that we have a full index map. +} + +// PDBs do not actually store global hashes, so when merging a type server +// PDB we have to synthesize global hashes. 
To do this, we first synthesize +// global hashes for the TPI stream, since it is independent, then we +// synthesize hashes for the IPI stream, using the hashes for the TPI stream +// as inputs. +void TypeServerSource::loadGHashes() { + // Don't hash twice. + if (!ghashes.empty()) + return; + pdb::PDBFile &pdbFile = pdbInputFile->session->getPDBFile(); + + // Hash TPI stream. + Expected expectedTpi = pdbFile.getPDBTpiStream(); + if (auto e = expectedTpi.takeError()) + fatal("Type server does not have TPI stream: " + toString(std::move(e))); + assignGHashesFromVector( + GloballyHashedType::hashTypes(expectedTpi->typeArray())); + isItemIndex.resize(ghashes.size()); + + // Hash IPI stream, which depends on TPI ghashes. + if (!pdbFile.hasPDBIpiStream()) + return; + Expected expectedIpi = pdbFile.getPDBIpiStream(); + if (auto e = expectedIpi.takeError()) + fatal("error retreiving IPI stream: " + toString(std::move(e))); + ipiSrc->assignGHashesFromVector( + GloballyHashedType::hashIds(expectedIpi->typeArray(), ghashes)); + + // The IPI stream isItemIndex bitvector should be all ones. + ipiSrc->isItemIndex.resize(ipiSrc->ghashes.size()); + ipiSrc->isItemIndex.set(0, ipiSrc->ghashes.size()); +} + +// Flatten discontiguous PDB type arrays to bytes so that we can use +// forEachTypeChecked instead of CVTypeArray iteration. Copying all types from +// type servers is faster than iterating all object files compiled with /Z7 with +// CVTypeArray, which has high overheads due to the virtual interface of +// BinaryStream::readBytes. +static ArrayRef typeArrayToBytes(const CVTypeArray &types) { + BinaryStreamRef stream = types.getUnderlyingStream(); + ArrayRef debugTypes; + checkError(stream.readBytes(0, stream.getLength(), debugTypes)); + return debugTypes; +} + +// Merge types from a type server PDB. 
+void TypeServerSource::remapTpiWithGHashes(GHashState *g) { + assert(config->debugGHashes && "ghashes must be enabled"); + + // IPI merging depends on TPI, so do TPI first, then do IPI. No need to + // propagate errors, those should've been handled during ghash loading. + pdb::PDBFile &pdbFile = pdbInputFile->session->getPDBFile(); + pdb::TpiStream &tpi = check(pdbFile.getPDBTpiStream()); + fillMapFromGHashes(g, indexMapStorage); + tpiMap = indexMapStorage; + mergeUniqueTypeRecords(typeArrayToBytes(tpi.typeArray())); + if (pdbFile.hasPDBIpiStream()) { + pdb::TpiStream &ipi = check(pdbFile.getPDBIpiStream()); + ipiSrc->indexMapStorage.resize(ipiSrc->ghashes.size()); + ipiSrc->fillMapFromGHashes(g, ipiSrc->indexMapStorage); + ipiMap = ipiSrc->indexMapStorage; + ipiSrc->tpiMap = tpiMap; + ipiSrc->ipiMap = ipiMap; + ipiSrc->mergeUniqueTypeRecords(typeArrayToBytes(ipi.typeArray())); + } +} + +void UseTypeServerSource::remapTpiWithGHashes(GHashState *g) { + // No remapping to do with /Zi objects. Simply use the index map from the type + // server. Errors should have been reported earlier. Symbols from this object + // will be ignored. + Expected maybeTsSrc = getTypeServerSource(); + if (!maybeTsSrc) { + typeMergingError = + joinErrors(std::move(typeMergingError), maybeTsSrc.takeError()); + return; + } + TypeServerSource *tsSrc = *maybeTsSrc; + tpiMap = tsSrc->tpiMap; + ipiMap = tsSrc->ipiMap; +} + +void PrecompSource::loadGHashes() { + if (getDebugH(file)) { + warn("ignoring .debug$H section; pch with ghash is not implemented"); + } + + uint32_t ghashIdx = 0; + std::vector hashVec; + forEachTypeChecked(file->debugTypes, [&](const CVType &ty) { + // Remember the index of the LF_ENDPRECOMP record so it can be excluded from + // the PDB. There must be an entry in the list of ghashes so that the type + // indexes of the following records in the /Yc PCH object line up. 
+    if (ty.kind() == LF_ENDPRECOMP)
+      endPrecompGHashIdx = ghashIdx;
+
+    hashVec.push_back(GloballyHashedType::hashType(ty, hashVec, hashVec));
+    isItemIndex.push_back(isIdRecord(ty.kind()));
+    ++ghashIdx;
+  });
+  assignGHashesFromVector(std::move(hashVec));
+}
+
+void UsePrecompSource::loadGHashes() {
+  PrecompSource *pchSrc = findPrecompSource(file, precompDependency);
+  if (!pchSrc)
+    return;
+
+  // To compute ghashes of a /Yu object file, we need to build on the
+  // ghashes of the /Yc PCH object. After we are done hashing, discard the
+  // ghashes from the PCH source so we don't unnecessarily try to deduplicate
+  // them.
+  std::vector hashVec =
+      pchSrc->ghashes.take_front(precompDependency.getTypesCount());
+  forEachTypeChecked(file->debugTypes, [&](const CVType &ty) {
+    hashVec.push_back(GloballyHashedType::hashType(ty, hashVec, hashVec));
+    isItemIndex.push_back(isIdRecord(ty.kind()));
+  });
+  hashVec.erase(hashVec.begin(),
+                hashVec.begin() + precompDependency.getTypesCount());
+  assignGHashesFromVector(std::move(hashVec));
+}
+
+void UsePrecompSource::remapTpiWithGHashes(GHashState *g) {
+  // This object was compiled with /Yu, so process the corresponding
+  // precompiled headers object (/Yc) first. Some type indices in the current
+  // object are referencing data in the precompiled headers object, so we need
+  // both to be loaded.
+  if (Error e = mergeInPrecompHeaderObj()) {
+    typeMergingError = joinErrors(std::move(typeMergingError), std::move(e));
+    return;
+  }
+
+  fillMapFromGHashes(g, indexMapStorage);
+  tpiMap = indexMapStorage;
+  ipiMap = indexMapStorage;
+  mergeUniqueTypeRecords(file->debugTypes,
+                         TypeIndex(precompDependency.getStartTypeIndex() +
+                                   precompDependency.getTypesCount()));
+}
+
+namespace {
+/// A concurrent hash table for global type hashing. It is based on this paper:
+/// Concurrent Hash Tables: Fast and General(?)!
+/// https://dl.acm.org/doi/10.1145/3309206
+///
+/// This hash table is meant to be used in two phases:
+/// 1. 
concurrent insertions +/// 2. concurrent reads +/// It does not support lookup, deletion, or rehashing. It uses linear probing. +/// +/// The paper describes storing a key-value pair in two machine words. +/// Generally, the values stored in this map are type indices, and we can use +/// those values to recover the ghash key from a side table. This allows us to +/// shrink the table entries further at the cost of some loads, and sidesteps +/// the need for a 128 bit atomic compare-and-swap operation. +/// +/// During insertion, a priority function is used to decide which insertion +/// should be preferred. This ensures that the output is deterministic. For +/// ghashing, lower tpiSrcIdx values (earlier inputs) are preferred. +/// +class GHashCell; +struct GHashTable { + GHashCell *table = nullptr; + uint32_t tableSize = 0; + + GHashTable() = default; + ~GHashTable(); + + /// Initialize the table with the given size. Because the table cannot be + /// resized, the initial size of the table must be large enough to contain all + /// inputs, or insertion may not be able to find an empty cell. + void init(uint32_t newTableSize); + + /// Insert the cell with the given ghash into the table. Return the insertion + /// position in the table. It is safe for the caller to store the insertion + /// position because the table cannot be resized. + uint32_t insert(GloballyHashedType ghash, GHashCell newCell); +}; + +/// A ghash table cell for deduplicating types from TpiSources. +class GHashCell { + uint64_t data = 0; + +public: + GHashCell() = default; + + // Construct data most to least significant so that sorting works well: + // - isItem + // - tpiSrcIdx + // - ghashIdx + // Add one to the tpiSrcIdx so that the 0th record from the 0th source has a + // non-zero representation. 
+ GHashCell(bool isItem, uint32_t tpiSrcIdx, uint32_t ghashIdx) + : data((uint64_t(isItem) << 63U) | (uint64_t(tpiSrcIdx + 1) << 32ULL) | + ghashIdx) { + assert(tpiSrcIdx == getTpiSrcIdx() && "round trip failure"); + assert(ghashIdx == getGHashIdx() && "round trip failure"); + } + + explicit GHashCell(uint64_t data) : data(data) {} + + // The empty cell is all zeros. + bool isEmpty() const { return data == 0ULL; } + + /// Extract the tpiSrcIdx. + uint32_t getTpiSrcIdx() const { + return ((uint32_t)(data >> 32U) & 0x7FFFFFFF) - 1; + } + + /// Extract the index into the ghash array of the TpiSource. + uint32_t getGHashIdx() const { return (uint32_t)data; } + + bool isItem() const { return data & (1ULL << 63U); } + + /// Get the ghash key for this cell. + GloballyHashedType getGHash() const { + return TpiSource::instances[getTpiSrcIdx()]->ghashes[getGHashIdx()]; + } + + /// The priority function for the cell. The data is stored such that lower + /// tpiSrcIdx and ghashIdx values are preferred, which means that type record + /// from earlier sources are more likely to prevail. + friend inline bool operator<(const GHashCell &l, const GHashCell &r) { + return l.data < r.data; + } +}; +} // namespace + +namespace lld { +namespace coff { +/// This type is just a wrapper around GHashTable with external linkage so it +/// can be used from a header. +struct GHashState { + GHashTable table; +}; +} // namespace coff +} // namespace lld + +GHashTable::~GHashTable() { delete[] table; } + +void GHashTable::init(uint32_t newTableSize) { + table = new GHashCell[newTableSize]; + memset(table, 0, newTableSize * sizeof(GHashCell)); + tableSize = newTableSize; +} + +uint32_t GHashTable::insert(GloballyHashedType ghash, GHashCell newCell) { + assert(!newCell.isEmpty() && "cannot insert empty cell value"); + + // FIXME: The low bytes of SHA1 have low entropy for short records, which + // type records are. Swap the byte order for better entropy. A better ghash + // won't need this. 
+ uint32_t startIdx = + ByteSwap_64(*reinterpret_cast(&ghash)) % tableSize; + + // Do a linear probe starting at startIdx. + uint32_t idx = startIdx; + while (true) { + // Run a compare and swap loop. There are four cases: + // - cell is empty: CAS into place and return + // - cell has matching key, earlier priority: do nothing, return + // - cell has matching key, later priority: CAS into place and return + // - cell has non-matching key: hash collision, probe next cell + auto *cellPtr = reinterpret_cast *>(&table[idx]); + GHashCell oldCell(cellPtr->load()); + while (oldCell.isEmpty() || oldCell.getGHash() == ghash) { + // Check if there is an existing ghash entry with a higher priority + // (earlier ordering). If so, this is a duplicate, we are done. + if (!oldCell.isEmpty() && oldCell < newCell) + return idx; + // Either the cell is empty, or our value is higher priority. Try to + // compare and swap. If it succeeds, we are done. + if (cellPtr->compare_exchange_weak(oldCell, newCell)) + return idx; + // If the CAS failed, check this cell again. + } + + // Advance the probe. Wrap around to the beginning if we run off the end. + ++idx; + idx = idx == tableSize ? 0 : idx; + if (idx == startIdx) { + // If this becomes an issue, we could mark failure and rehash from the + // beginning with a bigger table. There is no difference between rehashing + // internally and starting over. + report_fatal_error("ghash table is full"); + } + } + llvm_unreachable("left infloop"); +} + +TypeMerger::TypeMerger(llvm::BumpPtrAllocator &alloc) + : typeTable(alloc), idTable(alloc) {} + +TypeMerger::~TypeMerger() = default; + +void TypeMerger::mergeTypesWithGHash() { + // Load ghashes. Do type servers and PCH objects first. 
+  {
+    ScopedTimer t1(loadGHashTimer);
+    parallelForEach(TpiSource::dependencySources,
+                    [&](TpiSource *source) { source->loadGHashes(); });
+    parallelForEach(TpiSource::objectSources,
+                    [&](TpiSource *source) { source->loadGHashes(); });
+  }
+
+  ScopedTimer t2(mergeGHashTimer);
+  GHashState ghashState;
+
+  // Estimate the size of hash table needed to deduplicate ghashes. This *must*
+  // be larger than the number of unique types, or hash table insertion may not
+  // be able to find a vacant slot. Summing the input types guarantees this, but
+  // it is a gross overestimate. The table size could be reduced to save memory,
+  // but it would require implementing rehashing, and this table is generally
+  // small compared to total memory usage, at eight bytes per input type record,
+  // and most input type records are larger than eight bytes.
+  size_t tableSize = 0;
+  for (TpiSource *source : TpiSource::instances)
+    tableSize += source->ghashes.size();
+
+  // Cap the table size so that we can use 32-bit cell indices. Type indices are
+  // also 32-bit, so this is an inherent PDB file format limit anyway.
+  tableSize = std::min(size_t(INT32_MAX), tableSize);
+  ghashState.table.init(static_cast(tableSize));
+
+  // Insert ghashes in parallel. During concurrent insertion, we cannot observe
+  // the contents of the hash table cell, but we can remember the insertion
+  // position. Because the table does not rehash, the position will not change
+  // under insertion. After insertion is done, the value of the cell can be read
+  // to retrieve the final PDB type index. 
+ parallelForEachN(0, TpiSource::instances.size(), [&](size_t tpiSrcIdx) { + TpiSource *source = TpiSource::instances[tpiSrcIdx]; + source->indexMapStorage.resize(source->ghashes.size()); + for (uint32_t i = 0, e = source->ghashes.size(); i < e; i++) { + if (source->shouldOmitFromPdb(i)) { + source->indexMapStorage[i] = TypeIndex(SimpleTypeKind::NotTranslated); + continue; + } + GloballyHashedType ghash = source->ghashes[i]; + bool isItem = source->isItemIndex.test(i); + uint32_t cellIdx = + ghashState.table.insert(ghash, GHashCell(isItem, tpiSrcIdx, i)); + + // Store the ghash cell index as a type index in indexMapStorage. Later + // we will replace it with the PDB type index. + source->indexMapStorage[i] = TypeIndex::fromArrayIndex(cellIdx); + } + }); + + // Collect all non-empty cells and sort them. This will implicitly assign + // destination type indices, and partition the entries into type records and + // item records. It arranges types in this order: + // - type records + // - source 0, type 0... + // - source 1, type 1... + // - item records + // - source 0, type 1... + // - source 1, type 0... + std::vector entries; + for (const GHashCell &cell : + makeArrayRef(ghashState.table.table, tableSize)) { + if (!cell.isEmpty()) + entries.push_back(cell); + } + parallelSort(entries, std::less()); + log(formatv("ghash table load factor: {0:p} (size {1} / capacity {2})\n", + double(entries.size()) / tableSize, entries.size(), tableSize)); + + // Find out how many type and item indices there are. 
+ auto mid = + std::lower_bound(entries.begin(), entries.end(), GHashCell(true, 0, 0)); + assert((mid == entries.end() || mid->isItem()) && + (mid == entries.begin() || !std::prev(mid)->isItem()) && + "midpoint is not midpoint"); + uint32_t numTypes = std::distance(entries.begin(), mid); + uint32_t numItems = std::distance(mid, entries.end()); + log("Tpi record count: " + Twine(numTypes)); + log("Ipi record count: " + Twine(numItems)); + + // Make a list of the "unique" type records to merge for each tpi source. Type + // merging will skip indices not on this list. Store the destination PDB type + // index for these unique types in the tpiMap for each source. The entries for + // non-unique types will be filled in prior to type merging. + for (uint32_t i = 0, e = entries.size(); i < e; ++i) { + auto &cell = entries[i]; + uint32_t tpiSrcIdx = cell.getTpiSrcIdx(); + TpiSource *source = TpiSource::instances[tpiSrcIdx]; + source->uniqueTypes.push_back(cell.getGHashIdx()); + + // Update the ghash table to store the destination PDB type index in the + // table. + uint32_t pdbTypeIndex = i < numTypes ? i : i - numTypes; + uint32_t ghashCellIndex = + source->indexMapStorage[cell.getGHashIdx()].toArrayIndex(); + ghashState.table.table[ghashCellIndex] = + GHashCell(cell.isItem(), cell.getTpiSrcIdx(), pdbTypeIndex); + } + + // In parallel, remap all types. + for_each(TpiSource::dependencySources, [&](TpiSource *source) { + source->remapTpiWithGHashes(&ghashState); + }); + parallelForEach(TpiSource::objectSources, [&](TpiSource *source) { + source->remapTpiWithGHashes(&ghashState); + }); + + // Build a global map of from function ID to function type. + for (TpiSource *source : TpiSource::instances) { + for (auto idToType : source->funcIdToType) + funcIdToType.insert(idToType); + source->funcIdToType.clear(); + } + + TpiSource::clearGHashes(); +} + +/// Given the index into the ghash table for a particular type, return the type +/// index for that type in the output PDB. 
+static TypeIndex loadPdbTypeIndexFromCell(GHashState *g, + uint32_t ghashCellIdx) { + GHashCell cell = g->table.table[ghashCellIdx]; + return TypeIndex::fromArrayIndex(cell.getGHashIdx()); +} + +// Fill in a TPI or IPI index map using ghashes. For each source type, use its +// ghash to lookup its final type index in the PDB, and store that in the map. +void TpiSource::fillMapFromGHashes(GHashState *g, + SmallVectorImpl &mapToFill) { + for (size_t i = 0, e = ghashes.size(); i < e; ++i) { + TypeIndex fakeCellIndex = indexMapStorage[i]; + if (fakeCellIndex.isSimple()) + mapToFill[i] = fakeCellIndex; + else + mapToFill[i] = loadPdbTypeIndexFromCell(g, fakeCellIndex.toArrayIndex()); + } +} + +void TpiSource::clearGHashes() { + for (TpiSource *src : TpiSource::instances) { + if (src->ownedGHashes) + delete[] src->ghashes.data(); + src->ghashes = {}; + src->isItemIndex.clear(); + src->uniqueTypes.clear(); + } +} diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -69,13 +69,13 @@ lld::stderrOS = &stderrOS; errorHandler().cleanupCallback = []() { + TpiSource::clear(); freeArena(); ObjFile::instances.clear(); PDBInputFile::instances.clear(); ImportFile::instances.clear(); BitcodeFile::instances.clear(); memset(MergeChunk::instances, 0, sizeof(MergeChunk::instances)); - TpiSource::clear(); OutputSection::clear(); }; diff --git a/lld/COFF/PDB.h b/lld/COFF/PDB.h --- a/lld/COFF/PDB.h +++ b/lld/COFF/PDB.h @@ -20,6 +20,8 @@ } namespace lld { +class Timer; + namespace coff { class OutputSection; class SectionChunk; @@ -32,6 +34,10 @@ llvm::Optional> getFileLineCodeView(const SectionChunk *c, uint32_t addr); + +extern Timer loadGHashTimer; +extern Timer mergeGHashTimer; + } // namespace coff } // namespace lld diff --git a/lld/COFF/PDB.cpp b/lld/COFF/PDB.cpp --- a/lld/COFF/PDB.cpp +++ b/lld/COFF/PDB.cpp @@ -66,7 +66,8 @@ static ExitOnError exitOnErr; static Timer totalPdbLinkTimer("PDB Emission (Cumulative)", 
Timer::root()); - +Timer lld::coff::loadGHashTimer("Global Type Hashing", totalPdbLinkTimer); +Timer lld::coff::mergeGHashTimer("GHash Type Merging", totalPdbLinkTimer); static Timer addObjectsTimer("Add Objects", totalPdbLinkTimer); static Timer typeMergingTimer("Type Merging", addObjectsTimer); static Timer symbolMergingTimer("Symbol Merging", addObjectsTimer); @@ -112,8 +113,6 @@ /// externally. void addDebug(TpiSource *source); - bool mergeTypeRecords(TpiSource *source); - void addDebugSymbols(TpiSource *source); void mergeSymbolRecords(TpiSource *source, @@ -250,43 +249,18 @@ }); } -static bool remapTypeIndex(TypeIndex &ti, ArrayRef typeIndexMap) { - if (ti.isSimple()) - return true; - if (ti.toArrayIndex() >= typeIndexMap.size()) - return false; - ti = typeIndexMap[ti.toArrayIndex()]; - return true; -} - -static void remapTypesInSymbolRecord(ObjFile *file, SymbolKind symKind, - MutableArrayRef recordBytes, - TpiSource *source, - ArrayRef typeRefs) { - MutableArrayRef contents = - recordBytes.drop_front(sizeof(RecordPrefix)); - for (const TiReference &ref : typeRefs) { - unsigned byteSize = ref.Count * sizeof(TypeIndex); - if (contents.size() < ref.Offset + byteSize) - fatal("symbol record too short"); - - // This can be an item index or a type index. Choose the appropriate map. - bool isItemIndex = ref.Kind == TiRefKind::IndexRef; - ArrayRef typeOrItemMap = - isItemIndex ? source->ipiMap : source->tpiMap; - - MutableArrayRef tIs( - reinterpret_cast(contents.data() + ref.Offset), ref.Count); - for (TypeIndex &ti : tIs) { - if (!remapTypeIndex(ti, typeOrItemMap)) { - log("ignoring symbol record of kind 0x" + utohexstr(symKind) + " in " + - file->getName() + " with bad " + (isItemIndex ? "item" : "type") + - " index 0x" + utohexstr(ti.getIndex())); - ti = TypeIndex(SimpleTypeKind::NotTranslated); - continue; - } - } - } +static void addGHashTypeInfo(pdb::PDBFileBuilder &builder) { + // Start the TPI or IPI stream header. 
+ builder.getTpiBuilder().setVersionHeader(pdb::PdbTpiV80); + builder.getIpiBuilder().setVersionHeader(pdb::PdbTpiV80); + for_each(TpiSource::instances, [&](TpiSource *source) { + builder.getTpiBuilder().addTypeRecords(source->mergedTpi.recs, + source->mergedTpi.recSizes, + source->mergedTpi.recHashes); + builder.getIpiBuilder().addTypeRecords(source->mergedIpi.recs, + source->mergedIpi.recSizes, + source->mergedIpi.recHashes); + }); } static void @@ -329,7 +303,7 @@ /// MSVC translates S_PROC_ID_END to S_END, and S_[LG]PROC32_ID to S_[LG]PROC32 static void translateIdSymbols(MutableArrayRef &recordData, - TypeCollection &idTable) { + TypeMerger &tMerger, TpiSource *source) { RecordPrefix *prefix = reinterpret_cast(recordData.data()); SymbolKind kind = symbolKind(recordData); @@ -356,13 +330,25 @@ reinterpret_cast(content.data() + refs[0].Offset); // `ti` is the index of a FuncIdRecord or MemberFuncIdRecord which lives in // the IPI stream, whose `FunctionType` member refers to the TPI stream. - // Note that LF_FUNC_ID and LF_MEMFUNC_ID have the same record layout, and + // Note that LF_FUNC_ID and LF_MFUNC_ID have the same record layout, and // in both cases we just need the second type index. 
if (!ti->isSimple() && !ti->isNoneType()) { - CVType funcIdData = idTable.getType(*ti); - ArrayRef tiBuf = funcIdData.data().slice(8, 4); - assert(tiBuf.size() == 4 && "corrupt LF_[MEM]FUNC_ID record"); - *ti = *reinterpret_cast(tiBuf.data()); + if (config->debugGHashes) { + auto idToType = tMerger.funcIdToType.find(*ti); + if (idToType == tMerger.funcIdToType.end()) { + warn(formatv("S_[GL]PROC32_ID record in {0} refers to PDB item " + "index {1:X} which is not a LF_[M]FUNC_ID record", + source->file->getName(), ti->getIndex())); + *ti = TypeIndex(SimpleTypeKind::NotTranslated); + } else { + *ti = idToType->second; + } + } else { + CVType funcIdData = tMerger.getIDTable().getType(*ti); + ArrayRef tiBuf = funcIdData.data().slice(8, 4); + assert(tiBuf.size() == 4 && "corrupt LF_[M]FUNC_ID record"); + *ti = *reinterpret_cast(tiBuf.data()); + } } kind = (kind == SymbolKind::S_GPROC32_ID) ? SymbolKind::S_GPROC32 @@ -561,22 +547,16 @@ const_cast(sym.data().data()), sym.length()); } - // Discover type index references in the record. Skip it if we don't - // know where they are. - SmallVector typeRefs; - if (!discoverTypeIndicesInSymbol(sym, typeRefs)) { - log("ignoring unknown symbol record with kind 0x" + - utohexstr(sym.kind())); + // Re-map all the type index references. + if (!source->remapTypesInSymbolRecord(recordBytes)) { + log("error remapping types in symbol of kind 0x" + + utohexstr(sym.kind()) + ", ignoring"); return Error::success(); } - // Re-map all the type index references. - remapTypesInSymbolRecord(file, sym.kind(), recordBytes, source, - typeRefs); - // An object file may have S_xxx_ID symbols, but these get converted to // "real" symbols in a PDB. 
- translateIdSymbols(recordBytes, tMerger.getIDTable()); + translateIdSymbols(recordBytes, tMerger, source); sym = CVSymbol(recordBytes); // If this record refers to an offset in the object file's string table, @@ -748,11 +728,15 @@ const DebugSubsectionRecord &inlineeSubsection) { DebugInlineeLinesSubsectionRef inlineeLines; exitOnErr(inlineeLines.initialize(inlineeSubsection.getRecordData())); + if (!source) { + warn("ignoring inlinee lines section in file that lacks type information"); + return; + } // Remap type indices in inlinee line records in place. for (const InlineeSourceLine &line : inlineeLines) { TypeIndex &inlinee = *const_cast(&line.Header->Inlinee); - if (!remapTypeIndex(inlinee, source->ipiMap)) { + if (!source->remapTypeIndex(inlinee, TiRefKind::IndexRef)) { log("bad inlinee line record in " + file.getName() + " with bad inlinee index 0x" + utohexstr(inlinee.getIndex())); } @@ -827,20 +811,6 @@ warn(msg); } -bool PDBLinker::mergeTypeRecords(TpiSource *source) { - ScopedTimer t(typeMergingTimer); - // Before we can process symbol substreams from .debug$S, we need to process - // type information, file checksums, and the string table. Add type info to - // the PDB first, so that we can get the map from object file type and item - // indices to PDB type and item indices. - if (Error e = source->mergeDebugT(&tMerger)) { - // If the .debug$T sections fail to merge, assume there is no debug info. - warnUnusable(source->file, std::move(e)); - return false; - } - return true; -} - // Allocate memory for a .debug$S / .debug$F section and relocate it. static ArrayRef relocateDebugChunk(SectionChunk &debugChunk) { uint8_t *buffer = bAlloc.Allocate(debugChunk.getSize()); @@ -920,9 +890,28 @@ } void PDBLinker::addDebug(TpiSource *source) { + // Before we can process symbol substreams from .debug$S, we need to process + // type information, file checksums, and the string table. 
Add type info to + // the PDB first, so that we can get the map from object file type and item + // indices to PDB type and item indices. If we are using ghashes, types have + // already been merged. + if (!config->debugGHashes) { + ScopedTimer t(typeMergingTimer); + if (Error e = source->mergeDebugT(&tMerger)) { + // If type merging failed, ignore the symbols. + warnUnusable(source->file, std::move(e)); + return; + } + } + // If type merging failed, ignore the symbols. - if (mergeTypeRecords(source)) - addDebugSymbols(source); + Error typeError = std::move(source->typeMergingError); + if (typeError) { + warnUnusable(source->file, std::move(typeError)); + return; + } + + addDebugSymbols(source); } static pdb::BulkPublic createPublic(Defined *def) { @@ -955,25 +944,31 @@ for_each(ObjFile::instances, [&](ObjFile *obj) { createModuleDBI(builder, obj); }); - // Merge dependencies - TpiSource::forEachSource([&](TpiSource *source) { - if (source->isDependency()) - addDebug(source); - }); + // Reorder dependency type sources to come first. + TpiSource::sortDependencies(); - // Merge regular and dependent OBJs - TpiSource::forEachSource([&](TpiSource *source) { - if (!source->isDependency()) - addDebug(source); - }); + // Merge type information from input files using global type hashing. + if (config->debugGHashes) + tMerger.mergeTypesWithGHash(); + + // Merge dependencies and then regular objects. + for_each(TpiSource::dependencySources, + [&](TpiSource *source) { addDebug(source); }); + for_each(TpiSource::objectSources, + [&](TpiSource *source) { addDebug(source); }); builder.getStringTableBuilder().setStrings(pdbStrTab); t1.stop(); // Construct TPI and IPI stream contents. ScopedTimer t2(tpiStreamLayoutTimer); - addTypeInfo(builder.getTpiBuilder(), tMerger.getTypeTable()); - addTypeInfo(builder.getIpiBuilder(), tMerger.getIDTable()); + // Collect all the merged types. 
+ if (config->debugGHashes) { + addGHashTypeInfo(builder); + } else { + addTypeInfo(builder.getTpiBuilder(), tMerger.getTypeTable()); + addTypeInfo(builder.getIpiBuilder(), tMerger.getIDTable()); + } t2.stop(); } @@ -1014,8 +1009,8 @@ "Input OBJ files (expanded from all cmd-line inputs)"); print(TpiSource::countTypeServerPDBs(), "PDB type server dependencies"); print(TpiSource::countPrecompObjs(), "Precomp OBJ dependencies"); - print(tMerger.getTypeTable().size() + tMerger.getIDTable().size(), - "Merged TPI records"); + print(builder.getTpiBuilder().getRecordCount(), "Merged TPI records"); + print(builder.getIpiBuilder().getRecordCount(), "Merged IPI records"); print(pdbStrTab.size(), "Output PDB strings"); print(globalSymbols, "Global symbol records"); print(moduleSymbols, "Module symbol records"); @@ -1067,8 +1062,11 @@ } }; - printLargeInputTypeRecs("TPI", tMerger.tpiCounts, tMerger.getTypeTable()); - printLargeInputTypeRecs("IPI", tMerger.ipiCounts, tMerger.getIDTable()); + if (!config->debugGHashes) { + // FIXME: Reimplement for ghash. 
+ printLargeInputTypeRecs("TPI", tMerger.tpiCounts, tMerger.getTypeTable()); + printLargeInputTypeRecs("IPI", tMerger.ipiCounts, tMerger.getIDTable()); + } message(buffer); } diff --git a/lld/COFF/TypeMerger.h b/lld/COFF/TypeMerger.h --- a/lld/COFF/TypeMerger.h +++ b/lld/COFF/TypeMerger.h @@ -10,45 +10,51 @@ #define LLD_COFF_TYPEMERGER_H #include "Config.h" -#include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h" #include "llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h" +#include "llvm/DebugInfo/CodeView/TypeHashing.h" #include "llvm/Support/Allocator.h" +#include namespace lld { namespace coff { +using llvm::codeview::GloballyHashedType; +using llvm::codeview::TypeIndex; + +struct GHashState; + class TypeMerger { public: - TypeMerger(llvm::BumpPtrAllocator &alloc) - : typeTable(alloc), idTable(alloc), globalTypeTable(alloc), - globalIDTable(alloc) {} + TypeMerger(llvm::BumpPtrAllocator &alloc); + + ~TypeMerger(); /// Get the type table or the global type table if /DEBUG:GHASH is enabled. inline llvm::codeview::TypeCollection &getTypeTable() { - if (config->debugGHashes) - return globalTypeTable; + assert(!config->debugGHashes); return typeTable; } /// Get the ID table or the global ID table if /DEBUG:GHASH is enabled. inline llvm::codeview::TypeCollection &getIDTable() { - if (config->debugGHashes) - return globalIDTable; + assert(!config->debugGHashes); return idTable; } + /// Use global hashes to eliminate duplicate types and identify unique type + /// indices in each TpiSource. + void mergeTypesWithGHash(); + + /// Map from PDB function id type indexes to PDB function type indexes. + /// Populated after mergeTypesWithGHash. + llvm::DenseMap funcIdToType; + /// Type records that will go into the PDB TPI stream. llvm::codeview::MergingTypeTableBuilder typeTable; /// Item records that will go into the PDB IPI stream. 
llvm::codeview::MergingTypeTableBuilder idTable; - /// Type records that will go into the PDB TPI stream (for /DEBUG:GHASH) - llvm::codeview::GlobalTypeTableBuilder globalTypeTable; - - /// Item records that will go into the PDB IPI stream (for /DEBUG:GHASH) - llvm::codeview::GlobalTypeTableBuilder globalIDTable; - // When showSummary is enabled, these are histograms of TPI and IPI records // keyed by type index. SmallVector tpiCounts; diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -727,15 +727,38 @@ writeFromHalf16(loc, 0x3c6d0000); // addis r3, r13 relocateNoSym(loc, R_PPC64_TPREL16_HA, val); break; - case R_PPC64_TLSGD: - write32(loc, NOP); - write32(loc + 4, 0x38630000); // addi r3, r3 - // Since we are relocating a half16 type relocation and Loc + 4 points to - // the start of an instruction we need to advance the buffer by an extra - // 2 bytes on BE. - relocateNoSym(loc + 4 + (config->ekind == ELF64BEKind ? 2 : 0), - R_PPC64_TPREL16_LO, val); + case R_PPC64_GOT_TLSGD_PCREL34: + // Relax from paddi r3, 0, x@got@tlsgd@pcrel, 1 to + // paddi r3, r13, x@tprel, 0 + writePrefixedInstruction(loc, 0x06000000386d0000); + relocateNoSym(loc, R_PPC64_TPREL34, val); + break; + case R_PPC64_TLSGD: { + // PC Relative Relaxation: + // Relax from bl __tls_get_addr@notoc(x@tlsgd) to + // nop + // TOC Relaxation: + // Relax from bl __tls_get_addr(x@tlsgd) + // nop + // to + // nop + // addi r3, r3, x@tprel@l + const uintptr_t locAsInt = reinterpret_cast(loc); + if (locAsInt % 4 == 0) { + write32(loc, NOP); // nop + write32(loc + 4, 0x38630000); // addi r3, r3 + // Since we are relocating a half16 type relocation and Loc + 4 points to + // the start of an instruction we need to advance the buffer by an extra + // 2 bytes on BE. + relocateNoSym(loc + 4 + (config->ekind == ELF64BEKind ? 
2 : 0), + R_PPC64_TPREL16_LO, val); + } else if (locAsInt % 4 == 1) { + write32(loc - 1, NOP); + } else { + errorOrWarn("R_PPC64_TLSGD has unexpected byte alignment"); + } break; + } default: llvm_unreachable("unsupported relocation for TLS GD to LE relaxation"); } @@ -947,6 +970,8 @@ case R_PPC64_GOT_TLSGD16_HI: case R_PPC64_GOT_TLSGD16_LO: return R_TLSGD_GOT; + case R_PPC64_GOT_TLSGD_PCREL34: + return R_TLSGD_PC; case R_PPC64_GOT_TLSLD16: case R_PPC64_GOT_TLSLD16_HA: case R_PPC64_GOT_TLSLD16_HI: @@ -1261,6 +1286,7 @@ break; case R_PPC64_PCREL34: case R_PPC64_GOT_PCREL34: + case R_PPC64_GOT_TLSGD_PCREL34: case R_PPC64_GOT_TPREL_PCREL34: case R_PPC64_TPREL34: { const uint64_t si0Mask = 0x00000003ffff0000; @@ -1340,7 +1366,8 @@ if ((readPrefixedInstruction(data) & 0xfc000000) == 0xe4000000) return R_PPC64_RELAX_GOT_PC; } - if (expr == R_RELAX_TLS_GD_TO_IE) + + if (type != R_PPC64_GOT_TLSGD_PCREL34 && expr == R_RELAX_TLS_GD_TO_IE) return R_RELAX_TLS_GD_TO_IE_GOT_OFF; if (expr == R_RELAX_TLS_LD_TO_LE) return R_RELAX_TLS_LD_TO_LE_ABS; @@ -1381,10 +1408,35 @@ relocateNoSym(loc, R_PPC64_GOT_TPREL16_LO_DS, val); return; } - case R_PPC64_TLSGD: - write32(loc, NOP); // bl __tls_get_addr(sym@tlsgd) --> nop - write32(loc + 4, 0x7c636A14); // nop --> add r3, r3, r13 + case R_PPC64_GOT_TLSGD_PCREL34: { + // Relax from paddi r3, 0, sym@got@tlsgd@pcrel, 1 to + // pld r3, sym@got@tprel@pcrel + writePrefixedInstruction(loc, 0x04100000e4600000); + relocateNoSym(loc, R_PPC64_GOT_TPREL_PCREL34, val); + return; + } + case R_PPC64_TLSGD: { + // PC Relative Relaxation: + // Relax from bl __tls_get_addr@notoc(x@tlsgd) to + // nop + // TOC Relaxation: + // Relax from bl __tls_get_addr(x@tlsgd) + // nop + // to + // nop + // add r3, r3, r13 + const uintptr_t locAsInt = reinterpret_cast(loc); + if (locAsInt % 4 == 0) { + write32(loc, NOP); // bl __tls_get_addr(sym@tlsgd) --> nop + write32(loc + 4, 0x7c636A14); // nop --> add r3, r3, r13 + } else if (locAsInt % 4 == 1) { + // bl 
__tls_get_addr(sym@tlsgd) --> add r3, r3, r13 + write32(loc - 1, 0x7c636a14); + } else { + errorOrWarn("R_PPC64_TLSGD has unexpected byte alignment"); + } return; + } default: llvm_unreachable("unsupported relocation for TLS GD to IE relaxation"); } diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1867,7 +1867,7 @@ if (!sym) continue; - Symbol *real = addUndefined(saver.save("__real_" + name)); + Symbol *real = addUnusedUndefined(saver.save("__real_" + name)); Symbol *wrap = addUnusedUndefined(saver.save("__wrap_" + name)); v.push_back({sym, real, wrap}); diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1357,6 +1357,19 @@ if (type == R_PPC64_TOC16_LO && sym.isSection() && isa(sym) && cast(sym).section->name == ".toc") ppc64noTocRelax.insert({&sym, addend}); + + if (type == R_PPC64_TLSGD && expr == R_TLSDESC_CALL) { + if (i == end) { + errorOrWarn("R_PPC64_TLSGD may not be the last relocation" + + getLocation(sec, sym, offset)); + return; + } + + // Offset the 4-byte aligned R_PPC64_TLSGD by one byte in the NOTOC case, + // so we can discern it later from the toc-case. + if (i->getType(/*isMips64EL=*/false) == R_PPC64_REL24_NOTOC) + ++offset; + } } // Relax relocations. diff --git a/lld/docs/WebAssembly.rst b/lld/docs/WebAssembly.rst --- a/lld/docs/WebAssembly.rst +++ b/lld/docs/WebAssembly.rst @@ -39,6 +39,10 @@ Export all symbols (normally combined with --no-gc-sections) + Note that this will not export linker-generated mutable globals unless + the resulting binary already includes the 'mutable-globals' features + since that would otherwise create an invalid binary. + .. option:: --export-dynamic When building an executable, export any non-hidden symbols.
By default only diff --git a/lld/include/lld/Common/ErrorHandler.h b/lld/include/lld/Common/ErrorHandler.h --- a/lld/include/lld/Common/ErrorHandler.h +++ b/lld/include/lld/Common/ErrorHandler.h @@ -153,6 +153,13 @@ return std::move(*e); } +// Don't move from Expected wrappers around references. +template T &check(Expected e) { + if (!e) + fatal(llvm::toString(e.takeError())); + return *e; +} + template T check2(ErrorOr e, llvm::function_ref prefix) { if (auto ec = e.getError()) diff --git a/lld/test/COFF/pdb-global-hashes.test b/lld/test/COFF/pdb-global-hashes.test --- a/lld/test/COFF/pdb-global-hashes.test +++ b/lld/test/COFF/pdb-global-hashes.test @@ -2,7 +2,7 @@ RUN: yaml2obj %p/Inputs/pdb-hashes-2.yaml -o %t.2.obj RUN: yaml2obj %p/Inputs/pdb-hashes-2-missing.yaml -o %t.2.missing.obj RUN: lld-link /debug %t.1.obj %t.2.obj /entry:main /nodefaultlib /PDB:%t.nohash.pdb -RUN: lld-link /debug:ghash %t.1.obj %t.2.obj /entry:main /nodefaultlib /PDB:%t.hash.pdb +RUN: lld-link /debug:ghash -verbose %t.1.obj %t.2.obj /entry:main /nodefaultlib /PDB:%t.hash.pdb RUN: lld-link /debug:ghash %t.1.obj %t.2.missing.obj /entry:main /nodefaultlib /PDB:%t.mixed.pdb RUN: llvm-pdbutil dump -types -ids -dont-resolve-forward-refs %t.nohash.pdb | FileCheck %s RUN: llvm-pdbutil dump -types -ids -dont-resolve-forward-refs %t.hash.pdb | FileCheck %s diff --git a/lld/test/COFF/pdb-procid-remapping.test b/lld/test/COFF/pdb-procid-remapping.test --- a/lld/test/COFF/pdb-procid-remapping.test +++ b/lld/test/COFF/pdb-procid-remapping.test @@ -1,8 +1,12 @@ -# RUN: yaml2obj %p/Inputs/pdb1.yaml -o %t1.obj -# RUN: yaml2obj %p/Inputs/pdb2.yaml -o %t2.obj +# RUN: yaml2obj < %p/Inputs/pdb1.yaml > %t1.obj +# RUN: yaml2obj < %p/Inputs/pdb2.yaml > %t2.obj + # RUN: lld-link /debug /pdb:%t.pdb /dll /out:%t.dll /entry:main /nodefaultlib \ # RUN: %t1.obj %t2.obj +# RUN: llvm-pdbutil dump -symbols %t.pdb | FileCheck %s +# RUN: lld-link /debug /debug:ghash /pdb:%t.pdb /dll /out:%t.dll /entry:main /nodefaultlib 
\ +# RUN: %t1.obj %t2.obj # RUN: llvm-pdbutil dump -symbols %t.pdb | FileCheck %s CHECK: Symbols diff --git a/lld/test/COFF/pdb-type-server-missing.yaml b/lld/test/COFF/pdb-type-server-missing.yaml --- a/lld/test/COFF/pdb-type-server-missing.yaml +++ b/lld/test/COFF/pdb-type-server-missing.yaml @@ -5,6 +5,7 @@ # RUN: yaml2obj %s -o %t1.obj # RUN: yaml2obj %p/Inputs/pdb-type-server-missing-2.yaml -o %t2.obj # RUN: lld-link %t1.obj %t2.obj -out:%t.exe -debug -pdb:%t.pdb -nodefaultlib -entry:main 2>&1 | FileCheck %s -check-prefix=WARN +# RUN: lld-link %t1.obj %t2.obj -out:%t.exe -debug:ghash -pdb:%t.pdb -nodefaultlib -entry:main 2>&1 | FileCheck %s -check-prefix=WARN # RUN: lld-link %t1.obj %t2.obj -out:%t.exe -debug -pdb:%t.pdb -nodefaultlib -entry:main /ignore:4099 2>&1 | FileCheck %s -check-prefix=IGNORE -allow-empty # RUN: not lld-link %t1.obj %t2.obj -out:%t.exe -debug -pdb:%t.pdb -nodefaultlib -entry:main /WX 2>&1 | FileCheck %s -check-prefix=ERR # RUN: lld-link %t1.obj %t2.obj -out:%t.exe -debug -pdb:%t.pdb -nodefaultlib -entry:main /ignore:4099 /WX 2>&1 | FileCheck %s -check-prefix=IGNORE-ERR -allow-empty diff --git a/lld/test/COFF/pdb-type-server-simple.test b/lld/test/COFF/pdb-type-server-simple.test --- a/lld/test/COFF/pdb-type-server-simple.test +++ b/lld/test/COFF/pdb-type-server-simple.test @@ -20,7 +20,11 @@ RUN: yaml2obj %S/Inputs/pdb-type-server-simple-a.yaml -o a.obj RUN: yaml2obj %S/Inputs/pdb-type-server-simple-b.yaml -o b.obj RUN: llvm-pdbutil yaml2pdb %S/Inputs/pdb-type-server-simple-ts.yaml -pdb ts.pdb -RUN: lld-link a.obj b.obj -entry:main -debug -out:t.exe -pdb:t.pdb -nodefaultlib /summary | FileCheck %s -check-prefix SUMMARY +RUN: lld-link a.obj b.obj -entry:main -debug -out:t.exe -pdb:t.pdb -nodefaultlib -summary | FileCheck %s -check-prefix SUMMARY +RUN: llvm-pdbutil dump -symbols -types -ids -globals %t/t.pdb | FileCheck %s + +Re-run with /DEBUG:GHASH +RUN: lld-link a.obj b.obj -entry:main -debug:ghash -out:t.exe -pdb:t.pdb -nodefaultlib 
-summary -verbose RUN: llvm-pdbutil dump -symbols -types -ids -globals %t/t.pdb | FileCheck %s @@ -101,7 +105,8 @@ SUMMARY-NEXT: 2 Input OBJ files (expanded from all cmd-line inputs) SUMMARY-NEXT: 1 PDB type server dependencies SUMMARY-NEXT: 0 Precomp OBJ dependencies -SUMMARY-NEXT: 25 Merged TPI records +SUMMARY-NEXT: 9 Merged TPI records +SUMMARY-NEXT: 16 Merged IPI records SUMMARY-NEXT: 3 Output PDB strings SUMMARY-NEXT: 4 Global symbol records SUMMARY-NEXT: 14 Module symbol records diff --git a/lld/test/COFF/precomp-link.test b/lld/test/COFF/precomp-link.test --- a/lld/test/COFF/precomp-link.test +++ b/lld/test/COFF/precomp-link.test @@ -5,6 +5,7 @@ RUN: llvm-pdbutil dump -types %t.pdb | FileCheck %s RUN: lld-link %S/Inputs/precomp-a.obj %S/Inputs/precomp-invalid.obj %S/Inputs/precomp.obj /nodefaultlib /entry:main /debug /pdb:%t.pdb /out:%t.exe /opt:ref /opt:icf 2>&1 | FileCheck %s -check-prefix FAILURE +RUN: lld-link %S/Inputs/precomp-a.obj %S/Inputs/precomp-invalid.obj %S/Inputs/precomp.obj /nodefaultlib /entry:main /debug:ghash /pdb:%t.pdb /out:%t.exe /opt:ref /opt:icf 2>&1 | FileCheck %s -check-prefix FAILURE FIXME: The following RUN line should fail, regardless of whether debug info is enabled or not. Normally this would result in an error due to missing _PchSym_ @@ -52,12 +53,19 @@ CHECK-NOT: LF_ENDPRECOMP +Re-run with ghash. Eventually, perhaps this will be the default. 
+ +RUN: lld-link %S/Inputs/precomp-a.obj %S/Inputs/precomp-b.obj %S/Inputs/precomp.obj /nodefaultlib /entry:main /debug /debug:ghash /pdb:%t.pdb /out:%t.exe /opt:ref /opt:icf /summary | FileCheck %s -check-prefix SUMMARY +RUN: llvm-pdbutil dump -types %t.pdb | FileCheck %s + + SUMMARY: Summary SUMMARY-NEXT: -------------------------------------------------------------------------------- SUMMARY-NEXT: 3 Input OBJ files (expanded from all cmd-line inputs) SUMMARY-NEXT: 0 PDB type server dependencies SUMMARY-NEXT: 1 Precomp OBJ dependencies -SUMMARY-NEXT: 1044 Merged TPI records +SUMMARY-NEXT: 874 Merged TPI records +SUMMARY-NEXT: 170 Merged IPI records SUMMARY-NEXT: 5 Output PDB strings SUMMARY-NEXT: 167 Global symbol records SUMMARY-NEXT: 20 Module symbol records diff --git a/lld/test/COFF/s_udt.s b/lld/test/COFF/s_udt.s --- a/lld/test/COFF/s_udt.s +++ b/lld/test/COFF/s_udt.s @@ -2,6 +2,8 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-windows-msvc < %s > %t.obj # RUN: lld-link /DEBUG:FULL /nodefaultlib /entry:main %t.obj /PDB:%t.pdb /OUT:%t.exe # RUN: llvm-pdbutil dump -types -globals -symbols -modi=0 %t.pdb | FileCheck %s +# RUN: lld-link /DEBUG:FULL /debug:ghash /nodefaultlib /entry:main %t.obj /PDB:%t.pdb /OUT:%t.exe +# RUN: llvm-pdbutil dump -types -globals -symbols -modi=0 %t.pdb | FileCheck %s # CHECK: Types (TPI Stream) # CHECK-NEXT: ============================================================ diff --git a/lld/test/ELF/invalid/data-encoding.test b/lld/test/ELF/invalid/data-encoding.test --- a/lld/test/ELF/invalid/data-encoding.test +++ b/lld/test/ELF/invalid/data-encoding.test @@ -4,7 +4,7 @@ # Check we report this. 
# RUN: yaml2obj %s -o %t.o -# RUN: llvm-ar rcs %t.a %t.o +# RUN: llvm-ar rcS %t.a %t.o # RUN: not ld.lld --whole-archive %t.a -o /dev/null 2>&1 | FileCheck %s # CHECK: {{.*}}.a({{.*}}.o): corrupted ELF file: invalid data encoding diff --git a/lld/test/ELF/invalid/invalid-file-class.test b/lld/test/ELF/invalid/invalid-file-class.test --- a/lld/test/ELF/invalid/invalid-file-class.test +++ b/lld/test/ELF/invalid/invalid-file-class.test @@ -11,7 +11,7 @@ ## EV_CURRENT(1), ELFOSABI_LINUX(3), , ET_REL(1), EM_NONE(0) # RUN: echo -e -n "\x7f\x45\x4c\x46\x00\x01\x01\x03\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00" > %t/invalid.o -# RUN: llvm-ar --format=gnu cr %t/invalid-class.a %t/invalid.o +# RUN: llvm-ar --format=gnu crS %t/invalid-class.a %t/invalid.o # RUN: not ld.lld -whole-archive %t/invalid-class.a -o /dev/null 2>&1 | FileCheck %s # CHECK: invalid-class.a(invalid.o): corrupted ELF file: invalid file class diff --git a/lld/test/ELF/lto/wrap-1.ll b/lld/test/ELF/lto/wrap-1.ll --- a/lld/test/ELF/lto/wrap-1.ll +++ b/lld/test/ELF/lto/wrap-1.ll @@ -17,11 +17,12 @@ ; CHECK-NEXT: Binding: Global ; CHECK-NEXT: Type: Function -; Make sure that the 'r' (linker redefined) bit is set for bar and __wrap_bar -; in the resolutions file. -; RESOLS: ,bar,xr -; RESOLS: ,__wrap_bar,plx -; RESOLS: ,__real_bar,plxr +; Make sure that the 'r' (linker redefined) bit is set for bar and __real_bar +; in the resolutions file. The calls to bar and __real_bar will be routed to +; __wrap_bar and bar, respectively. So they cannot be inlined. 
+; RESOLS: ,bar,xr{{$}} +; RESOLS: ,__wrap_bar,plx{{$}} +; RESOLS: ,__real_bar,plr{{$}} target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/lld/test/ELF/ppc64-tls-pcrel-gd.s b/lld/test/ELF/ppc64-tls-pcrel-gd.s new file mode 100644 --- /dev/null +++ b/lld/test/ELF/ppc64-tls-pcrel-gd.s @@ -0,0 +1,94 @@ +# REQUIRES: ppc +# RUN: split-file %s %t + +# RUN: llvm-mc -filetype=obj -triple=powerpc64le %t/asm -o %t.o +# RUN: llvm-mc -filetype=obj -triple=powerpc64le %t/defs -o %t-defs.o +# RUN: ld.lld --shared %t-defs.o --soname=t-defs -o %t-defs.so +# RUN: ld.lld -T %t/lds --shared %t.o -o %t-gd.so +# RUN: ld.lld -T %t/lds %t.o %t-defs.so -o %t-gdtoie +# RUN: ld.lld -T %t/lds %t.o %t-defs.o -o %t-gdtole + +# RUN: llvm-readelf -r %t-gd.so | FileCheck %s --check-prefix=GD-RELOC +# RUN: llvm-readelf -s %t-gd.so | FileCheck %s --check-prefix=GD-SYM +# RUN: llvm-objdump -d --no-show-raw-insn --mcpu=pwr10 %t-gd.so | FileCheck %s --check-prefix=GD + +# RUN: llvm-readelf -r %t-gdtoie | FileCheck %s --check-prefix=GDTOIE-RELOC +# RUN: llvm-readelf -s %t-gdtoie | FileCheck %s --check-prefix=GDTOIE-SYM +# RUN: llvm-objdump -d --no-show-raw-insn --mcpu=pwr10 %t-gdtoie | FileCheck %s --check-prefix=GDTOIE + +# RUN: llvm-readelf -r %t-gdtole | FileCheck %s --check-prefix=GDTOLE-RELOC +# RUN: llvm-readelf -s %t-gdtole | FileCheck %s --check-prefix=GDTOLE-SYM +# RUN: llvm-objdump -d --no-show-raw-insn --mcpu=pwr10 %t-gdtole | FileCheck %s --check-prefix=GDTOLE + +## This test checks the General Dynamic PC Relative TLS implementation for lld. 
+## GD - General Dynamic with no relaxation possible +## GDTOIE - General Dynamic relaxed to Initial Exec +## GDTOLE - General Dynamic relaxed to Local Exec + +#--- lds +SECTIONS { + .text_addr 0x1001000 : { *(.text_addr) } +} + +#--- defs +.section .tbss,"awT",@nobits +.globl x +x: + .long 0 +.globl y +y: + .long 0 + +#--- asm + +# GD-RELOC: Relocation section '.rela.dyn' at offset 0x100b8 contains 4 entries: +# GD-RELOC: 0000000001001160 0000000200000044 R_PPC64_DTPMOD64 0000000000000000 x + 0 +# GD-RELOC: 0000000001001168 000000020000004e R_PPC64_DTPREL64 0000000000000000 x + 0 +# GD-RELOC: 0000000001001170 0000000300000044 R_PPC64_DTPMOD64 0000000000000000 y + 0 +# GD-RELOC: 0000000001001178 000000030000004e R_PPC64_DTPREL64 0000000000000000 y + 0 + +# GD-SYM: Symbol table '.dynsym' contains 4 entries: +# GD-SYM: 2: 0000000000000000 0 TLS GLOBAL DEFAULT UND x +# GD-SYM: 3: 0000000000000000 0 TLS GLOBAL DEFAULT UND y + + +# GDTOIE-RELOC: Relocation section '.rela.dyn' at offset 0x{{.*}} contains 2 entries: +# GDTOIE-RELOC: 00000000010010e0 0000000200000049 R_PPC64_TPREL64 0000000000000000 x + 0 +# GDTOIE-RELOC: 00000000010010e8 0000000300000049 R_PPC64_TPREL64 0000000000000000 y + 0 + +# GDTOIE-SYM: Symbol table '.dynsym' contains 4 entries: +# GDTOIE-SYM: 2: 0000000000000000 0 TLS GLOBAL DEFAULT UND x +# GDTOIE-SYM: 3: 0000000000000000 0 TLS GLOBAL DEFAULT UND y + + +# GDTOLE-RELOC: There are no relocations in this file. 
+ +# GDTOLE-SYM: Symbol table '.symtab' contains 5 entries: +# GDTOLE-SYM: 3: 0000000000000000 0 TLS GLOBAL DEFAULT 3 x +# GDTOLE-SYM: 4: 0000000000000004 0 TLS GLOBAL DEFAULT 3 y + +# GD-LABEL: : +# GD-NEXT: paddi 3, 0, 352, 1 +# GD-NEXT: bl +# GD-NEXT: paddi 3, 0, 356, 1 +# GD-NEXT: bl +# GD-NEXT: blr +# GDTOIE-LABEL: : +# GDTOIE-NEXT: pld 3, 224(0), 1 +# GDTOIE-NEXT: add 3, 3, 13 +# GDTOIE-NEXT: pld 3, 220(0), 1 +# GDTOIE-NEXT: add 3, 3, 13 +# GDTOIE-NEXT: blr +# GDTOLE-LABEL: : +# GDTOLE-NEXT: paddi 3, 13, -28672, 0 +# GDTOLE-NEXT: nop +# GDTOLE-NEXT: paddi 3, 13, -28668, 0 +# GDTOLE-NEXT: nop +# GDTOLE-NEXT: blr +.section .text_addr, "ax", %progbits +GDTwoVal: + paddi 3, 0, x@got@tlsgd@pcrel, 1 + bl __tls_get_addr@notoc(x@tlsgd) + paddi 3, 0, y@got@tlsgd@pcrel, 1 + bl __tls_get_addr@notoc(y@tlsgd) + blr diff --git a/lld/test/wasm/command-exports-no-tors.s b/lld/test/wasm/command-exports-no-tors.s new file mode 100644 --- /dev/null +++ b/lld/test/wasm/command-exports-no-tors.s @@ -0,0 +1,54 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: wasm-ld --no-entry %t.o -o %t.wasm +# RUN: obj2yaml %t.wasm | FileCheck %s + +# Like command-exports.s, but with no ctors or dtors, so there should be no +# __wasm_call_ctors, __cxa_atexit, or wrappers. 
+ + .globl foo_i32 +foo_i32: + .functype foo_i32 (i32, i32) -> (i32) + local.get 0 + local.get 1 + i32.add + end_function + + .globl foo_f64 +foo_f64: + .functype foo_f64 (f64, f64) -> (f64) + local.get 0 + local.get 1 + f64.add + end_function + + .export_name foo_i32, foo_i32 + .export_name foo_f64, foo_f64 + +# CHECK: - Type: EXPORT +# CHECK-NEXT: Exports: +# CHECK-NEXT: - Name: memory +# CHECK-NEXT: Kind: MEMORY +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Name: foo_i32 +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Name: foo_f64 +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 1 + +# CHECK: - Type: CODE + +# CHECK: - Index: 0 +# CHECK-NEXT: Locals: [] +# CHECK-NEXT: Body: 200020016A0B +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: Locals: [] +# CHECK-NEXT: Body: 20002001A00B + +# CHECK: - Type: CUSTOM +# CHECK-NEXT: Name: name +# CHECK-NEXT: FunctionNames: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Name: foo_i32 +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: Name: foo_f64 diff --git a/lld/test/wasm/command-exports.s b/lld/test/wasm/command-exports.s new file mode 100644 --- /dev/null +++ b/lld/test/wasm/command-exports.s @@ -0,0 +1,113 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: wasm-ld --no-entry %t.o -o %t.wasm +# RUN: obj2yaml %t.wasm | FileCheck %s + +# This test defines a command with two exported functions, as well as a static +# constructor and a static destructor. Check that the exports, constructor, and +# destructor are all set up properly. 
+ + .globl foo_i32 +foo_i32: + .functype foo_i32 (i32, i32) -> (i32) + local.get 0 + local.get 1 + i32.add + end_function + + .globl foo_f64 +foo_f64: + .functype foo_f64 (f64, f64) -> (f64) + local.get 0 + local.get 1 + f64.add + end_function + + .globl some_ctor +some_ctor: + .functype some_ctor () -> () + end_function + + .globl some_dtor +some_dtor: + .functype some_dtor () -> () + end_function + + .hidden __cxa_atexit + .globl __cxa_atexit +__cxa_atexit: + .functype __cxa_atexit (i32, i32, i32) -> (i32) + i32.const 0 + end_function + + .section .text..Lcall_dtors.1,"",@ +.Lcall_dtors.1: + .functype .Lcall_dtors.1 (i32) -> () + call some_dtor + end_function + + .section .text..Lregister_call_dtors.1,"",@ +.Lregister_call_dtors.1: + .functype .Lregister_call_dtors.1 () -> () + block + i32.const .Lcall_dtors.1 + i32.const 0 + i32.const 0 + call __cxa_atexit + i32.eqz + br_if 0 + unreachable +.LBB6_2: + end_block + end_function + + .section .init_array.1,"",@ + .p2align 2 + .int32 some_ctor + .int32 .Lregister_call_dtors.1 + .export_name foo_i32, foo_i32 + .export_name foo_f64, foo_f64 + +# CHECK: - Type: EXPORT +# CHECK-NEXT: Exports: +# CHECK-NEXT: - Name: memory +# CHECK-NEXT: Kind: MEMORY +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Name: foo_i32 +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 8 +# CHECK-NEXT: - Name: foo_f64 +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 9 + +# CHECK: - Type: CODE + +# CHECK: - Index: 8 +# CHECK-NEXT: Locals: [] +# CHECK-NEXT: Body: 10002000200110010B +# CHECK-NEXT: - Index: 9 +# CHECK-NEXT: Locals: [] +# CHECK-NEXT: Body: 10002000200110020B + +# CHECK: - Type: CUSTOM +# CHECK-NEXT: Name: name +# CHECK-NEXT: FunctionNames: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Name: __wasm_call_ctors +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: Name: foo_i32 +# CHECK-NEXT: - Index: 2 +# CHECK-NEXT: Name: foo_f64 +# CHECK-NEXT: - Index: 3 +# CHECK-NEXT: Name: some_ctor +# CHECK-NEXT: - Index: 4 +# CHECK-NEXT: Name: some_dtor +# CHECK-NEXT: 
- Index: 5 +# CHECK-NEXT: Name: __cxa_atexit +# CHECK-NEXT: - Index: 6 +# CHECK-NEXT: Name: .Lcall_dtors.1 +# CHECK-NEXT: - Index: 7 +# CHECK-NEXT: Name: .Lregister_call_dtors.1 +# CHECK-NEXT: - Index: 8 +# CHECK-NEXT: Name: foo_i32.command_export +# CHECK-NEXT: - Index: 9 +# CHECK-NEXT: Name: foo_f64.command_export diff --git a/lld/test/wasm/init-fini-gc.ll b/lld/test/wasm/init-fini-gc.ll deleted file mode 100644 --- a/lld/test/wasm/init-fini-gc.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc -filetype=obj -o %t.o %s -; RUN: wasm-ld %t.o -o %t.wasm -; RUN: obj2yaml %t.wasm | FileCheck %s - -; RUN: wasm-ld %t.o -o %t.wasm -; RUN: obj2yaml %t.wasm | FileCheck %s - -; RUN: wasm-ld --export=__wasm_call_ctors %t.o -o %t.export.wasm -; RUN: obj2yaml %t.export.wasm | FileCheck %s -check-prefix=EXPORT - -; Test that the __wasm_call_ctor function if not referenced - -target triple = "wasm32-unknown-unknown" - -define hidden void @_start() { -entry: - ret void -} - -define hidden void @func1() { -entry: - ret void -} - -define hidden void @func2() { -entry: - ret void -} - -define i32 @__cxa_atexit(i32 %func, i32 %arg, i32 %dso_handle) { - ret i32 0 -} - -@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [ - { i32, void ()*, i8* } { i32 1, void ()* @func1, i8* null } -] - -@llvm.global_dtors = appending global [1 x { i32, void ()*, i8* }] [ - { i32, void ()*, i8* } { i32 1, void ()* @func2, i8* null } -] - -; CHECK-NOT: __cxa_atexit -; CHECK-NOT: __wasm_call_ctors - -; EXPORT: __wasm_call_ctors -; EXPORT: func1 -; EXPORT: func2 -; EXPORT: __cxa_atexit diff --git a/lld/test/wasm/init-fini-no-gc.ll b/lld/test/wasm/init-fini-no-gc.ll new file mode 100644 --- /dev/null +++ b/lld/test/wasm/init-fini-no-gc.ll @@ -0,0 +1,85 @@ +; RUN: llc -filetype=obj -o %t.o %s +; RUN: wasm-ld %t.o -o %t.wasm +; RUN: obj2yaml %t.wasm | FileCheck %s + +; RUN: wasm-ld --export=__wasm_call_ctors %t.o -o %t.export.wasm +; RUN: obj2yaml %t.export.wasm | FileCheck %s 
-check-prefix=EXPORT + +; Test that we emit wrappers and call __wasm_call_ctor when not referenced. + +target triple = "wasm32-unknown-unknown" + +define hidden void @_start() { +entry: + ret void +} + +define hidden void @func1() { +entry: + ret void +} + +define hidden void @func2() { +entry: + ret void +} + +define hidden i32 @__cxa_atexit(i32 %func, i32 %arg, i32 %dso_handle) { + ret i32 0 +} + +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [ + { i32, void ()*, i8* } { i32 1, void ()* @func1, i8* null } +] + +@llvm.global_dtors = appending global [1 x { i32, void ()*, i8* }] [ + { i32, void ()*, i8* } { i32 1, void ()* @func2, i8* null } +] + +; Check that we have exactly the needed exports: `memory` because that's +; currently on by default, and `_start`, because that's the default entrypoint. + +; CHECK: - Type: EXPORT +; CHECK-NEXT: Exports: +; CHECK-NEXT: - Name: memory +; CHECK-NEXT: Kind: MEMORY +; CHECK-NEXT: Index: 0 +; CHECK-NEXT: - Name: _start +; CHECK-NEXT: Kind: FUNCTION +; CHECK-NEXT: Index: 7 + +; Check the body of `_start`'s command-export wrapper. + +; CHECK: - Type: CODE + +; CHECK: - Index: 7 +; CHECK-NEXT: Locals: [] +; CHECK-NEXT: Body: 100010010B + +; Check the symbol table to ensure all the functions are here, and that +; index 7 above refers to the function we think it does. 
+ +; CHECK: - Type: CUSTOM +; CHECK-NEXT: Name: name +; CHECK-NEXT: FunctionNames: +; CHECK-NEXT: - Index: 0 +; CHECK-NEXT: Name: __wasm_call_ctors +; CHECK-NEXT: - Index: 1 +; CHECK-NEXT: Name: _start +; CHECK-NEXT: - Index: 2 +; CHECK-NEXT: Name: func1 +; CHECK-NEXT: - Index: 3 +; CHECK-NEXT: Name: func2 +; CHECK-NEXT: - Index: 4 +; CHECK-NEXT: Name: __cxa_atexit +; CHECK-NEXT: - Index: 5 +; CHECK-NEXT: Name: .Lcall_dtors.1 +; CHECK-NEXT: - Index: 6 +; CHECK-NEXT: Name: .Lregister_call_dtors.1 +; CHECK-NEXT: - Index: 7 +; CHECK-NEXT: Name: _start.command_export + +; EXPORT: __wasm_call_ctors +; EXPORT: func1 +; EXPORT: func2 +; EXPORT: __cxa_atexit diff --git a/lld/test/wasm/mutable-global-exports.s b/lld/test/wasm/mutable-global-exports.s new file mode 100644 --- /dev/null +++ b/lld/test/wasm/mutable-global-exports.s @@ -0,0 +1,88 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# +# Should fail without mutable globals feature enabled. +# RUN: not wasm-ld --export-all %t.o -o %t.wasm 2>&1 | FileCheck -check-prefix=CHECK-ERR %s +# RUN: not wasm-ld --export=foo_global %t.o -o %t.wasm 2>&1 | FileCheck -check-prefix=CHECK-ERR %s +# +# RUN: wasm-ld --features=mutable-globals --export=foo_global %t.o -o %t.wasm +# RUN: obj2yaml %t.wasm | FileCheck %s + +# Explcitly check that __stack_pointer can be exported +# RUN: wasm-ld --features=mutable-globals --export=__stack_pointer %t.o -o %t.wasm +# RUN: obj2yaml %t.wasm | FileCheck -check-prefix=CHECK-SP %s + +# RUN: wasm-ld --features=mutable-globals --export-all %t.o -o %t.wasm +# RUN: obj2yaml %t.wasm | FileCheck -check-prefix=CHECK-ALL %s + + +.globl _start +.globl foo_global + +.globaltype foo_global, i32 +foo_global: + +_start: + .functype _start () -> () + end_function + +# CHECK-ERR: mutable global exported but 'mutable-globals' feature not present in inputs: `foo_global`. 
Use --no-check-features to suppress + +# CHECK: - Type: EXPORT +# CHECK-NEXT: Exports: +# CHECK-NEXT: - Name: memory +# CHECK-NEXT: Kind: MEMORY +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Name: _start +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Name: foo_global +# CHECK-NEXT: Kind: GLOBAL +# CHECK-NEXT: Index: 1 +# CHECK-NEXT: - Type: CODE + +# CHECK-SP: - Type: EXPORT +# CHECK-SP-NEXT: Exports: +# CHECK-SP-NEXT: - Name: memory +# CHECK-SP-NEXT: Kind: MEMORY +# CHECK-SP-NEXT: Index: 0 +# CHECK-SP-NEXT: - Name: __stack_pointer +# CHECK-SP-NEXT: Kind: GLOBAL +# CHECK-SP-NEXT: Index: 0 +# CHECK-SP-NEXT: - Name: _start +# CHECK-SP-NEXT: Kind: FUNCTION +# CHECK-SP-NEXT: Index: 0 +# CHECK-SP-NEXT: - Type: CODE + +# CHECK-ALL: - Type: EXPORT +# CHECK-ALL-NEXT: Exports: +# CHECK-ALL-NEXT: - Name: memory +# CHECK-ALL-NEXT: Kind: MEMORY +# CHECK-ALL-NEXT: Index: 0 +# CHECK-ALL-NEXT: - Name: __wasm_call_ctors +# CHECK-ALL-NEXT: Kind: FUNCTION +# CHECK-ALL-NEXT: Index: 0 +# CHECK-ALL-NEXT: - Name: _start +# CHECK-ALL-NEXT: Kind: FUNCTION +# CHECK-ALL-NEXT: Index: 1 +# CHECK-ALL-NEXT: - Name: foo_global +# CHECK-ALL-NEXT: Kind: GLOBAL +# CHECK-ALL-NEXT: Index: 1 +# CHECK-ALL-NEXT: - Name: __dso_handle +# CHECK-ALL-NEXT: Kind: GLOBAL +# CHECK-ALL-NEXT: Index: 2 +# CHECK-ALL-NEXT: - Name: __data_end +# CHECK-ALL-NEXT: Kind: GLOBAL +# CHECK-ALL-NEXT: Index: 3 +# CHECK-ALL-NEXT: - Name: __global_base +# CHECK-ALL-NEXT: Kind: GLOBAL +# CHECK-ALL-NEXT: Index: 4 +# CHECK-ALL-NEXT: - Name: __heap_base +# CHECK-ALL-NEXT: Kind: GLOBAL +# CHECK-ALL-NEXT: Index: 5 +# CHECK-ALL-NEXT: - Name: __memory_base +# CHECK-ALL-NEXT: Kind: GLOBAL +# CHECK-ALL-NEXT: Index: 6 +# CHECK-ALL-NEXT: - Name: __table_base +# CHECK-ALL-NEXT: Kind: GLOBAL +# CHECK-ALL-NEXT: Index: 7 +# CHECK-ALL-NEXT: - Type: CODE diff --git a/lld/test/wasm/mutable-globals.s b/lld/test/wasm/mutable-globals.s --- a/lld/test/wasm/mutable-globals.s +++ b/lld/test/wasm/mutable-globals.s @@ -1,5 +1,6 @@ # 
RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s # RUN: not wasm-ld %t.o -o %t.wasm 2>&1 | FileCheck %s +# RUN: wasm-ld --features=mutable-globals %t.o -o %t.wasm .globl _start _start: diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -572,7 +572,6 @@ make(nullSignature, "__wasm_apply_relocs")); } - if (config->isPic) { WasmSym::stackPointer = createUndefinedGlobal("__stack_pointer", config->is64.getValueOr(false) @@ -841,6 +840,29 @@ config->entry); } + // If the user code defines a `__wasm_call_dtors` function, remember it so + // that we can call it from the command export wrappers. Unlike + // `__wasm_call_ctors` which we synthesize, `__wasm_call_dtors` is defined + // by libc/etc., because destructors are registered dynamically with + // `__cxa_atexit` and friends. + if (!config->relocatable && !config->shared && + !WasmSym::callCtors->isUsedInRegularObj && + WasmSym::callCtors->getName() != config->entry && + !config->exportedSymbols.count(WasmSym::callCtors->getName())) { + if (Symbol *callDtors = handleUndefined("__wasm_call_dtors")) { + if (auto *callDtorsFunc = dyn_cast(callDtors)) { + if (callDtorsFunc->signature && + (!callDtorsFunc->signature->Params.empty() || + !callDtorsFunc->signature->Returns.empty())) { + error("__wasm_call_dtors must have no argument or return values"); + } + WasmSym::callDtors = callDtorsFunc; + } else { + error("__wasm_call_dtors must be a function"); + } + } + } + createOptionalSymbols(); if (errorCount()) diff --git a/lld/wasm/InputChunks.h b/lld/wasm/InputChunks.h --- a/lld/wasm/InputChunks.h +++ b/lld/wasm/InputChunks.h @@ -122,7 +122,10 @@ class InputFunction : public InputChunk { public: InputFunction(const WasmSignature &s, const WasmFunction *func, ObjFile *f) - : InputChunk(f, InputChunk::Function), signature(s), function(func) {} + : InputChunk(f, InputChunk::Function), signature(s), function(func), + exportName(func && 
func->ExportName.hasValue() + ? (*func->ExportName).str() + : llvm::Optional()) {} static bool classof(const InputChunk *c) { return c->kind() == InputChunk::Function || @@ -133,8 +136,10 @@ StringRef getName() const override { return function->SymbolName; } StringRef getDebugName() const override { return function->DebugName; } llvm::Optional getExportName() const { - return function ? function->ExportName : llvm::Optional(); + return exportName.hasValue() ? llvm::Optional(*exportName) + : llvm::Optional(); } + void setExportName(std::string exportName) { this->exportName = exportName; } uint32_t getComdat() const override { return function->Comdat; } uint32_t getFunctionInputOffset() const { return getInputSectionOffset(); } uint32_t getFunctionCodeOffset() const { return function->CodeOffset; } @@ -172,6 +177,7 @@ } const WasmFunction *function; + llvm::Optional exportName; llvm::Optional functionIndex; llvm::Optional tableIndex; uint32_t compressedFuncSize = 0; diff --git a/lld/wasm/MarkLive.cpp b/lld/wasm/MarkLive.cpp --- a/lld/wasm/MarkLive.cpp +++ b/lld/wasm/MarkLive.cpp @@ -44,6 +44,7 @@ void enqueue(Symbol *sym); void markSymbol(Symbol *sym); void mark(); + bool isCallCtorsLive(); // A list of chunks to visit. SmallVector queue; @@ -58,22 +59,6 @@ sym->markLive(); if (InputChunk *chunk = sym->getChunk()) queue.push_back(chunk); - - // The ctor functions are all referenced by the synthetic callCtors - // function. However, this function does not contain relocations so we - // have to manually mark the ctors as live if callCtors itself is live. 
- if (sym == WasmSym::callCtors) { - if (config->isPic) - enqueue(WasmSym::applyRelocs); - for (const ObjFile *obj : symtab->objectFiles) { - const WasmLinkingData &l = obj->getWasmObj()->linkingData(); - for (const WasmInitFunc &f : l.InitFunctions) { - auto* initSym = obj->getFunctionSymbol(f.Symbol); - if (!initSym->isDiscarded()) - enqueue(initSym); - } - } - } } void MarkLive::run() { @@ -86,16 +71,29 @@ if (sym->isNoStrip() || sym->isExported()) enqueue(sym); - // For relocatable output, we need to preserve all the ctor functions - if (config->relocatable) { - for (const ObjFile *obj : symtab->objectFiles) { - const WasmLinkingData &l = obj->getWasmObj()->linkingData(); - for (const WasmInitFunc &f : l.InitFunctions) - enqueue(obj->getFunctionSymbol(f.Symbol)); + // If we'll be calling the user's `__wasm_call_dtors` function, mark it live. + if (Symbol *callDtors = WasmSym::callDtors) + enqueue(callDtors); + + // The ctor functions are all referenced by the synthetic callCtors + // function. However, this function does not contain relocations so we + // have to manually mark the ctors as live. + for (const ObjFile *obj : symtab->objectFiles) { + const WasmLinkingData &l = obj->getWasmObj()->linkingData(); + for (const WasmInitFunc &f : l.InitFunctions) { + auto *initSym = obj->getFunctionSymbol(f.Symbol); + if (!initSym->isDiscarded()) + enqueue(initSym); } } + // In Emscripten-style PIC, `__wasm_call_ctors` calls `__wasm_apply_relocs`. if (config->isPic) + enqueue(WasmSym::applyRelocs); + + // If we have any non-discarded init functions, mark `__wasm_call_ctors` as + // live so that we assign it an index and call it. + if (isCallCtorsLive()) enqueue(WasmSym::callCtors); if (config->sharedMemory && !config->shared) @@ -169,5 +167,27 @@ } } +bool MarkLive::isCallCtorsLive() { + // In a reloctable link, we don't call `__wasm_call_ctors`. 
+ if (config->relocatable) + return false; + + // In Emscripten-style PIC, we call `__wasm_call_ctors` which calls + // `__wasm_apply_relocs`. + if (config->isPic) + return true; + + // If there are any init functions, mark `__wasm_call_ctors` live so that + // it can call them. + for (const ObjFile *file : symtab->objectFiles) { + const WasmLinkingData &l = file->getWasmObj()->linkingData(); + for (const WasmInitFunc &f : l.InitFunctions) + if (!file->getFunctionSymbol(f.Symbol)->isDiscarded()) + return true; + } + + return false; +} + } // namespace wasm } // namespace lld diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h --- a/lld/wasm/Symbols.h +++ b/lld/wasm/Symbols.h @@ -471,6 +471,10 @@ // Function that directly calls all ctors in priority order. static DefinedFunction *callCtors; + // __wasm_call_dtors + // Function that calls the libc/etc. cleanup function. + static DefinedFunction *callDtors; + // __wasm_apply_relocs // Function that applies relocations to data segment post-instantiation. static DefinedFunction *applyRelocs; diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp --- a/lld/wasm/Symbols.cpp +++ b/lld/wasm/Symbols.cpp @@ -66,6 +66,7 @@ namespace wasm { DefinedFunction *WasmSym::callCtors; +DefinedFunction *WasmSym::callDtors; DefinedFunction *WasmSym::initMemory; DefinedFunction *WasmSym::applyRelocs; DefinedFunction *WasmSym::initTLS; diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -62,6 +62,8 @@ void createApplyRelocationsFunction(); void createCallCtorsFunction(); void createInitTLSFunction(); + void createCommandExportWrappers(); + void createCommandExportWrapper(uint32_t functionIndex, DefinedFunction *f); void assignIndexes(); void populateSymtab(); @@ -95,6 +97,9 @@ std::vector initFunctions; llvm::StringMap> customSectionMapping; + // Stable storage for command export wrapper function name strings. 
+ std::list commandExportWrapperNames; + // Elements that are used to construct the final output std::string header; std::vector outputSections; @@ -453,7 +458,7 @@ if (!config->checkFeatures) return; - if (!config->relocatable && used.count("mutable-globals") == 0) { + if (!config->relocatable && allowed.count("mutable-globals") == 0) { for (const Symbol *sym : out.importSec->importedSymbols) { if (auto *global = dyn_cast(sym)) { if (global->getGlobalType()->Mutable) { @@ -571,12 +576,13 @@ } export_ = {name, WASM_EXTERNAL_FUNCTION, f->getFunctionIndex()}; } else if (auto *g = dyn_cast(sym)) { - // TODO(sbc): Remove this check once to mutable global proposal is - // implement in all major browsers. - // See: https://github.com/WebAssembly/mutable-global - if (g->getGlobalType()->Mutable) { - // Only __stack_pointer and __tls_base should ever be create as mutable. - assert(g == WasmSym::stackPointer || g == WasmSym::tlsBase); + if (g->getGlobalType()->Mutable && !g->getFile() && !g->forceExport) { + // Avoid exporting mutable globals that are linker synthesized (e.g. + // __stack_pointer or __tls_base) unless they are explicitly exported + // from the command line. + // Without this check `--export-all` would cause any program using the + // stack pointer to export a mutable global even if none of the input + // files were built with the `mutable-globals` feature. continue; } export_ = {name, WASM_EXTERNAL_GLOBAL, g->getGlobalIndex()}; @@ -639,6 +645,53 @@ out.typeSec->registerType(e->signature); } +// In a command-style link, create a wrapper for each exported symbol +// which calls the constructors and destructors. +void Writer::createCommandExportWrappers() { + // This logic doesn't currently support Emscripten-style PIC mode. + assert(!config->isPic); + + // If there are no ctors and there's no libc `__wasm_call_dtors` to + // call, don't wrap the exports. 
+ if (initFunctions.empty() && WasmSym::callDtors == NULL) + return; + + std::vector toWrap; + + for (Symbol *sym : symtab->getSymbols()) + if (sym->isExported()) + if (auto *f = dyn_cast(sym)) + toWrap.push_back(f); + + for (auto *f : toWrap) { + auto funcNameStr = (f->getName() + ".command_export").str(); + commandExportWrapperNames.push_back(funcNameStr); + const std::string &funcName = commandExportWrapperNames.back(); + + auto func = make(*f->getSignature(), funcName); + if (f->function->getExportName().hasValue()) + func->setExportName(f->function->getExportName()->str()); + else + func->setExportName(f->getName().str()); + + DefinedFunction *def = + symtab->addSyntheticFunction(funcName, f->flags, func); + def->markLive(); + + def->flags |= WASM_SYMBOL_EXPORTED; + def->flags &= ~WASM_SYMBOL_VISIBILITY_HIDDEN; + def->forceExport = f->forceExport; + + f->flags |= WASM_SYMBOL_VISIBILITY_HIDDEN; + f->flags &= ~WASM_SYMBOL_EXPORTED; + f->forceExport = false; + + out.functionSec->addFunction(func); + + createCommandExportWrapper(f->getFunctionIndex(), def); + } +} + static void scanRelocations() { for (ObjFile *file : symtab->objectFiles) { LLVM_DEBUG(dbgs() << "scanRelocations: " << file->getName() << "\n"); @@ -924,7 +977,10 @@ // Create synthetic "__wasm_call_ctors" function based on ctor functions // in input object. void Writer::createCallCtorsFunction() { - if (!WasmSym::callCtors->isLive()) + // If __wasm_call_ctors isn't referenced, there aren't any ctors, and we + // aren't calling `__wasm_apply_relocs` for Emscripten-style PIC, don't + // define the `__wasm_call_ctors` function. + if (!WasmSym::callCtors->isLive() && initFunctions.empty() && !config->isPic) return; // First write the body's contents to a string. @@ -953,6 +1009,46 @@ createFunction(WasmSym::callCtors, bodyContent); } +// Create a wrapper around a function export which calls the +// static constructors and destructors. 
+void Writer::createCommandExportWrapper(uint32_t functionIndex, + DefinedFunction *f) { + // First write the body's contents to a string. + std::string bodyContent; + { + raw_string_ostream os(bodyContent); + writeUleb128(os, 0, "num locals"); + + // If we have any ctors, or we're calling `__wasm_apply_relocs` for + // Emscripten-style PIC, call `__wasm_call_ctors` which performs those + // calls. + if (!initFunctions.empty() || config->isPic) { + writeU8(os, WASM_OPCODE_CALL, "CALL"); + writeUleb128(os, WasmSym::callCtors->getFunctionIndex(), + "function index"); + } + + // Call the user's code, leaving any return values on the operand stack. + for (size_t i = 0; i < f->signature->Params.size(); ++i) { + writeU8(os, WASM_OPCODE_LOCAL_GET, "local.get"); + writeUleb128(os, i, "local index"); + } + writeU8(os, WASM_OPCODE_CALL, "CALL"); + writeUleb128(os, functionIndex, "function index"); + + // Call the function that calls the destructors. + if (DefinedFunction *callDtors = WasmSym::callDtors) { + writeU8(os, WASM_OPCODE_CALL, "CALL"); + writeUleb128(os, callDtors->getFunctionIndex(), "function index"); + } + + // End the function, returning the return values from the user's code. + writeU8(os, WASM_OPCODE_END, "END"); + } + + createFunction(f, bodyContent); +} + void Writer::createInitTLSFunction() { if (!WasmSym::initTLS->isLive()) return; @@ -1089,6 +1185,18 @@ if (config->isPic) createApplyRelocationsFunction(); createCallCtorsFunction(); + + // Create export wrappers for commands if needed. + // + // If the input contains a call to `__wasm_call_ctors`, either in one of + // the input objects or an explicit export from the command-line, we + // assume ctors and dtors are taken care of already. 
+ if (!config->relocatable && !config->isPic && + !WasmSym::callCtors->isUsedInRegularObj && + !WasmSym::callCtors->isExported()) { + log("-- createCommandExportWrappers"); + createCommandExportWrappers(); + } } if (!config->relocatable && config->sharedMemory && !config->shared) diff --git a/lldb/include/lldb/Core/StructuredDataImpl.h b/lldb/include/lldb/Core/StructuredDataImpl.h --- a/lldb/include/lldb/Core/StructuredDataImpl.h +++ b/lldb/include/lldb/Core/StructuredDataImpl.h @@ -68,14 +68,18 @@ return error; } - // Grab the plugin. - auto plugin_sp = lldb::StructuredDataPluginSP(m_plugin_wp); + // Grab the plugin + lldb::StructuredDataPluginSP plugin_sp = m_plugin_wp.lock(); + + // If there's no plugin, call underlying data's dump method: if (!plugin_sp) { - error.SetErrorString("Cannot pretty print structured data: " - "plugin doesn't exist."); + if (!m_data_sp) { + error.SetErrorString("No data to describe."); + return error; + } + m_data_sp->Dump(stream, true); return error; } - // Get the data's description. return plugin_sp->GetDescription(m_data_sp, stream); } diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py @@ -318,7 +318,13 @@ raise _ConnectionRefused() # Got EOF, connection dropped. 
def create_socket(self): - sock = socket.socket() + try: + sock = socket.socket(family=socket.AF_INET) + except OSError as e: + if e.errno != errno.EAFNOSUPPORT: + raise + sock = socket.socket(family=socket.AF_INET6) + logger = self.logger triple = self.dbg.GetSelectedPlatform().GetTriple() @@ -379,7 +385,7 @@ ["*:{}".format(self.port)] else: commandline_args = self.debug_monitor_extra_args + \ - ["127.0.0.1:{}".format(self.port)] + ["localhost:{}".format(self.port)] if attach_pid: commandline_args += ["--attach=%d" % attach_pid] diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py @@ -728,24 +728,26 @@ def request_setBreakpoints(self, file_path, line_array, condition=None, hitCondition=None): (dir, base) = os.path.split(file_path) - breakpoints = [] - for line in line_array: - bp = {'line': line} - if condition is not None: - bp['condition'] = condition - if hitCondition is not None: - bp['hitCondition'] = hitCondition - breakpoints.append(bp) source_dict = { 'name': base, 'path': file_path } args_dict = { 'source': source_dict, - 'breakpoints': breakpoints, - 'lines': '%s' % (line_array), 'sourceModified': False, } + if line_array is not None: + args_dict['lines'] = '%s' % line_array + breakpoints = [] + for line in line_array: + bp = {'line': line} + if condition is not None: + bp['condition'] = condition + if hitCondition is not None: + bp['hitCondition'] = hitCondition + breakpoints.append(bp) + args_dict['breakpoints'] = breakpoints + command_dict = { 'command': 'setBreakpoints', 'type': 'request', diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp +++ 
b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp @@ -1234,7 +1234,7 @@ const int backlog = 5; TCPSocket listen_socket(true, child_processes_inherit); if (llvm::Error error = - listen_socket.Listen("127.0.0.1:0", backlog).ToError()) + listen_socket.Listen("localhost:0", backlog).ToError()) return error; Socket *accept_socket; @@ -1243,7 +1243,7 @@ llvm::SmallString<32> remote_addr; llvm::raw_svector_ostream(remote_addr) - << "connect://127.0.0.1:" << listen_socket.GetLocalPortNumber(); + << "connect://localhost:" << listen_socket.GetLocalPortNumber(); std::unique_ptr conn_up( new ConnectionFileDescriptor()); diff --git a/lldb/test/API/commands/gui/basicdebug/TestGuiBasicDebug.py b/lldb/test/API/commands/gui/basicdebug/TestGuiBasicDebug.py --- a/lldb/test/API/commands/gui/basicdebug/TestGuiBasicDebug.py +++ b/lldb/test/API/commands/gui/basicdebug/TestGuiBasicDebug.py @@ -15,7 +15,6 @@ # under ASAN on a loaded machine.. @skipIfAsan @skipIfCursesSupportMissing - @expectedFailureAll(archs=["aarch64"], oslist=["linux"]) def test_gui(self): self.build() @@ -37,11 +36,11 @@ self.child.send("d") # down self.child.expect_exact("return 1; // In function") self.child.send("f") # finish - self.child.expect("func\(\); // Break here[^\r\n]+<<< Thread 1: step out") + self.child.expect("<<< Thread 1: step out") self.child.send("s") # move onto the second one - self.child.expect("func\(\); // Second[^\r\n]+<<< Thread 1: step in") + self.child.expect("<<< Thread 1: step in") self.child.send("n") # step over - self.child.expect("return 0;[^\r\n]+<<< Thread 1: step over") + self.child.expect("<<< Thread 1: step over") # Press escape to quit the gui self.child.send(escape_key) diff --git a/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py b/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py --- a/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py +++ b/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py @@ -7,7 
+7,7 @@ import lldb import lldbsuite.test.lldbutil as lldbutil from lldbsuite.test.lldbtest import * - +from lldbsuite.test.decorators import * class TestStopHooks(TestBase): @@ -71,6 +71,8 @@ """Test that the returning False from a stop hook works""" self.do_test_auto_continue(True) + # Test is flakey on Linux. + @skipIfLinux def do_test_auto_continue(self, return_true): """Test that auto-continue works.""" # We set auto-continue to 1 but the stop hook only applies to step_out_of_me, diff --git a/lldb/test/API/functionalities/gdb_remote_client/gdbclientutils.py b/lldb/test/API/functionalities/gdb_remote_client/gdbclientutils.py --- a/lldb/test/API/functionalities/gdb_remote_client/gdbclientutils.py +++ b/lldb/test/API/functionalities/gdb_remote_client/gdbclientutils.py @@ -1,3 +1,4 @@ +import errno import os import os.path import threading @@ -317,12 +318,20 @@ def __init__(self, port = 0): self.responder = MockGDBServerResponder() self.port = port - self._socket = socket.socket() + try: + self._socket = socket.socket(family=socket.AF_INET) + except OSError as e: + if e.errno != errno.EAFNOSUPPORT: + raise + self._socket = socket.socket(family=socket.AF_INET6) def start(self): # Block until the socket is up, so self.port is available immediately. # Then start a thread that waits for a client connection. 
- addr = ("127.0.0.1", self.port) + if self._socket.family == socket.AF_INET: + addr = ("127.0.0.1", self.port) + elif self._socket.family == socket.AF_INET6: + addr = ("::1", self.port) self._socket.bind(addr) self.port = self._socket.getsockname()[1] self._socket.listen(1) diff --git a/lldb/test/API/python_api/sbstructureddata/TestStructuredDataAPI.py b/lldb/test/API/python_api/sbstructureddata/TestStructuredDataAPI.py --- a/lldb/test/API/python_api/sbstructureddata/TestStructuredDataAPI.py +++ b/lldb/test/API/python_api/sbstructureddata/TestStructuredDataAPI.py @@ -35,6 +35,13 @@ # Tests for invalid data type self.invalid_struct_test(example) + # Test that GetDescription works: + s.Clear() + error = example.GetDescription(s) + self.assertTrue(error.Success(), "GetDescription works") + if not "key_float" in s.GetData(): + self.fail("FAILED: could not find key_float in description output") + dict_struct = lldb.SBStructuredData() dict_struct = example.GetValueForKey("key_dict") diff --git a/lldb/test/API/tools/lldb-server/commandline/TestStubReverseConnect.py b/lldb/test/API/tools/lldb-server/commandline/TestStubReverseConnect.py --- a/lldb/test/API/tools/lldb-server/commandline/TestStubReverseConnect.py +++ b/lldb/test/API/tools/lldb-server/commandline/TestStubReverseConnect.py @@ -1,5 +1,6 @@ from __future__ import print_function +import errno import gdbremote_testcase import lldbgdbserverutils import re @@ -24,11 +25,20 @@ self.listener_port = self.listener_socket.getsockname()[1] def create_listener_socket(self): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + except OSError as e: + if e.errno != errno.EAFNOSUPPORT: + raise + sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) self.assertIsNotNone(sock) sock.settimeout(self.DEFAULT_TIMEOUT) - sock.bind(("127.0.0.1", 0)) + if sock.family == socket.AF_INET: + bind_addr = ("127.0.0.1", 0) + elif sock.family == socket.AF_INET6: + 
bind_addr = ("::1", 0) + sock.bind(bind_addr) sock.listen(1) def tear_down_listener(): diff --git a/lldb/test/API/tools/lldb-vscode/breakpoint/TestVSCode_setBreakpoints.py b/lldb/test/API/tools/lldb-vscode/breakpoint/TestVSCode_setBreakpoints.py --- a/lldb/test/API/tools/lldb-vscode/breakpoint/TestVSCode_setBreakpoints.py +++ b/lldb/test/API/tools/lldb-vscode/breakpoint/TestVSCode_setBreakpoints.py @@ -219,6 +219,48 @@ self.assertTrue(breakpoint['verified'], "expect breakpoint still verified") + @skipIfWindows + @skipIfRemote + def test_clear_breakpoints_unset_breakpoints(self): + '''Test clearing breakpoints like test_set_and_clear, but clear + breakpoints by omitting the breakpoints array instead of sending an + empty one.''' + lines = [line_number('main.cpp', 'break 12'), + line_number('main.cpp', 'break 13')] + + # Visual Studio Code Debug Adaptors have no way to specify the file + # without launching or attaching to a process, so we must start a + # process in order to be able to set breakpoints. + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + + # Set one breakpoint and verify that it got set correctly. + response = self.vscode.request_setBreakpoints(self.main_path, lines) + line_to_id = {} + breakpoints = response['body']['breakpoints'] + self.assertEquals(len(breakpoints), len(lines), + "expect %u source breakpoints" % (len(lines))) + for (breakpoint, index) in zip(breakpoints, range(len(lines))): + line = breakpoint['line'] + self.assertTrue(line, lines[index]) + # Store the "id" of the breakpoint that was set for later + line_to_id[line] = breakpoint['id'] + self.assertTrue(line in lines, "line expected in lines array") + self.assertTrue(breakpoint['verified'], + "expect breakpoint verified") + + # Now clear all breakpoints for the source file by not setting the + # lines array. 
+ lines = None + response = self.vscode.request_setBreakpoints(self.main_path, lines) + breakpoints = response['body']['breakpoints'] + self.assertEquals(len(breakpoints), 0, "expect no source breakpoints") + + # Verify with the target that all breakpoints have been cleared. + response = self.vscode.request_testGetTargetBreakpoints() + breakpoints = response['body']['breakpoints'] + self.assertEquals(len(breakpoints), 0, "expect no source breakpoints") + @skipIfWindows @skipIfRemote def test_functionality(self): diff --git a/lldb/tools/lldb-server/lldb-gdbserver.cpp b/lldb/tools/lldb-server/lldb-gdbserver.cpp --- a/lldb/tools/lldb-server/lldb-gdbserver.cpp +++ b/lldb/tools/lldb-server/lldb-gdbserver.cpp @@ -267,7 +267,8 @@ final_host_and_port.append("localhost"); final_host_and_port.append(host_and_port); - const std::string::size_type colon_pos = final_host_and_port.find(':'); + // Note: use rfind, because the host/port may look like "[::1]:12345". + const std::string::size_type colon_pos = final_host_and_port.rfind(':'); if (colon_pos != std::string::npos) { connection_host = final_host_and_port.substr(0, colon_pos); connection_port = final_host_and_port.substr(colon_pos + 1); diff --git a/lldb/tools/lldb-vscode/lldb-vscode.cpp b/lldb/tools/lldb-vscode/lldb-vscode.cpp --- a/lldb/tools/lldb-vscode/lldb-vscode.cpp +++ b/lldb/tools/lldb-vscode/lldb-vscode.cpp @@ -1936,27 +1936,32 @@ // Decode the source breakpoint infos for this "setBreakpoints" request SourceBreakpointMap request_bps; - for (const auto &bp : *breakpoints) { - auto bp_obj = bp.getAsObject(); - if (bp_obj) { - SourceBreakpoint src_bp(*bp_obj); - request_bps[src_bp.line] = src_bp; - - // We check if this breakpoint already exists to update it - auto existing_source_bps = g_vsc.source_breakpoints.find(path); - if (existing_source_bps != g_vsc.source_breakpoints.end()) { - const auto &existing_bp = existing_source_bps->second.find(src_bp.line); - if (existing_bp != existing_source_bps->second.end()) { - 
existing_bp->second.UpdateBreakpoint(src_bp); - AppendBreakpoint(existing_bp->second.bp, response_breakpoints, path, - src_bp.line); - continue; + // "breakpoints" may be unset, in which case we treat it the same as being set + // to an empty array. + if (breakpoints) { + for (const auto &bp : *breakpoints) { + auto bp_obj = bp.getAsObject(); + if (bp_obj) { + SourceBreakpoint src_bp(*bp_obj); + request_bps[src_bp.line] = src_bp; + + // We check if this breakpoint already exists to update it + auto existing_source_bps = g_vsc.source_breakpoints.find(path); + if (existing_source_bps != g_vsc.source_breakpoints.end()) { + const auto &existing_bp = + existing_source_bps->second.find(src_bp.line); + if (existing_bp != existing_source_bps->second.end()) { + existing_bp->second.UpdateBreakpoint(src_bp); + AppendBreakpoint(existing_bp->second.bp, response_breakpoints, path, + src_bp.line); + continue; + } } + // At this point the breakpoint is new + src_bp.SetBreakpoint(path.data()); + AppendBreakpoint(src_bp.bp, response_breakpoints, path, src_bp.line); + g_vsc.source_breakpoints[path][src_bp.line] = std::move(src_bp); } - // At this point the breakpoint is new - src_bp.SetBreakpoint(path.data()); - AppendBreakpoint(src_bp.bp, response_breakpoints, path, src_bp.line); - g_vsc.source_breakpoints[path][src_bp.line] = std::move(src_bp); } } diff --git a/lldb/unittests/Host/SocketTest.cpp b/lldb/unittests/Host/SocketTest.cpp --- a/lldb/unittests/Host/SocketTest.cpp +++ b/lldb/unittests/Host/SocketTest.cpp @@ -14,12 +14,24 @@ using namespace lldb_private; -class SocketTest : public testing::Test { +struct SocketTestParams { + bool is_ipv6; + std::string localhost_ip; +}; + +class SocketTest : public testing::TestWithParam { public: SubsystemRAII subsystems; + +protected: + bool HostSupportsProtocol() const { + if (GetParam().is_ipv6) + return HostSupportsIPv6(); + return HostSupportsIPv4(); + } }; -TEST_F(SocketTest, DecodeHostAndPort) { +TEST_P(SocketTest, DecodeHostAndPort) 
{ std::string host_str; std::string port_str; int32_t port; @@ -86,7 +98,7 @@ } #if LLDB_ENABLE_POSIX -TEST_F(SocketTest, DomainListenConnectAccept) { +TEST_P(SocketTest, DomainListenConnectAccept) { llvm::SmallString<64> Path; std::error_code EC = llvm::sys::fs::createUniqueDirectory("DomainListenConnectAccept", Path); ASSERT_FALSE(EC); @@ -102,18 +114,22 @@ } #endif -TEST_F(SocketTest, TCPListen0ConnectAccept) { +TEST_P(SocketTest, TCPListen0ConnectAccept) { + if (!HostSupportsProtocol()) + return; std::unique_ptr socket_a_up; std::unique_ptr socket_b_up; - CreateTCPConnectedSockets("127.0.0.1", &socket_a_up, &socket_b_up); + CreateTCPConnectedSockets(GetParam().localhost_ip, &socket_a_up, + &socket_b_up); } -TEST_F(SocketTest, TCPGetAddress) { +TEST_P(SocketTest, TCPGetAddress) { std::unique_ptr socket_a_up; std::unique_ptr socket_b_up; - if (!HostSupportsIPv4()) + if (!HostSupportsProtocol()) return; - CreateTCPConnectedSockets("127.0.0.1", &socket_a_up, &socket_b_up); + CreateTCPConnectedSockets(GetParam().localhost_ip, &socket_a_up, + &socket_b_up); EXPECT_EQ(socket_a_up->GetLocalPortNumber(), socket_b_up->GetRemotePortNumber()); @@ -121,11 +137,16 @@ socket_a_up->GetRemotePortNumber()); EXPECT_NE(socket_a_up->GetLocalPortNumber(), socket_b_up->GetLocalPortNumber()); - EXPECT_STREQ("127.0.0.1", socket_a_up->GetRemoteIPAddress().c_str()); - EXPECT_STREQ("127.0.0.1", socket_b_up->GetRemoteIPAddress().c_str()); + EXPECT_STREQ(GetParam().localhost_ip.c_str(), + socket_a_up->GetRemoteIPAddress().c_str()); + EXPECT_STREQ(GetParam().localhost_ip.c_str(), + socket_b_up->GetRemoteIPAddress().c_str()); } -TEST_F(SocketTest, UDPConnect) { +TEST_P(SocketTest, UDPConnect) { + // UDPSocket::Connect() creates sockets with AF_INET (IPv4). 
+ if (!HostSupportsIPv4()) + return; llvm::Expected> socket = UDPSocket::Connect("127.0.0.1:0", /*child_processes_inherit=*/false); @@ -133,7 +154,9 @@ EXPECT_TRUE(socket.get()->IsValid()); } -TEST_F(SocketTest, TCPListen0GetPort) { +TEST_P(SocketTest, TCPListen0GetPort) { + if (!HostSupportsIPv4()) + return; Predicate port_predicate; port_predicate.SetValue(0, eBroadcastNever); llvm::Expected> sock = @@ -143,12 +166,13 @@ EXPECT_NE(sock.get()->GetLocalPortNumber(), 0); } -TEST_F(SocketTest, TCPGetConnectURI) { +TEST_P(SocketTest, TCPGetConnectURI) { std::unique_ptr socket_a_up; std::unique_ptr socket_b_up; - if (!HostSupportsIPv4()) + if (!HostSupportsProtocol()) return; - CreateTCPConnectedSockets("127.0.0.1", &socket_a_up, &socket_b_up); + CreateTCPConnectedSockets(GetParam().localhost_ip, &socket_a_up, + &socket_b_up); llvm::StringRef scheme; llvm::StringRef hostname; @@ -160,7 +184,8 @@ EXPECT_EQ(port, socket_a_up->GetRemotePortNumber()); } -TEST_F(SocketTest, UDPGetConnectURI) { +TEST_P(SocketTest, UDPGetConnectURI) { + // UDPSocket::Connect() creates sockets with AF_INET (IPv4). if (!HostSupportsIPv4()) return; llvm::Expected> socket = @@ -177,7 +202,7 @@ } #if LLDB_ENABLE_POSIX -TEST_F(SocketTest, DomainGetConnectURI) { +TEST_P(SocketTest, DomainGetConnectURI) { llvm::SmallString<64> domain_path; std::error_code EC = llvm::sys::fs::createUniqueDirectory("DomainListenConnectAccept", domain_path); @@ -202,3 +227,13 @@ EXPECT_EQ(path, domain_path); } #endif + +INSTANTIATE_TEST_CASE_P( + SocketTests, SocketTest, + testing::Values(SocketTestParams{/*is_ipv6=*/false, + /*localhost_ip=*/"127.0.0.1"}, + SocketTestParams{/*is_ipv6=*/true, /*localhost_ip=*/"::1"}), + // Prints "SocketTests/SocketTest.DecodeHostAndPort/ipv4" etc. in test logs. + [](const testing::TestParamInfo &info) { + return info.param.is_ipv6 ? 
"ipv6" : "ipv4"; + }); diff --git a/lldb/unittests/Host/SocketTestUtilities.cpp b/lldb/unittests/Host/SocketTestUtilities.cpp --- a/lldb/unittests/Host/SocketTestUtilities.cpp +++ b/lldb/unittests/Host/SocketTestUtilities.cpp @@ -101,13 +101,14 @@ "Creating a canary {0} TCP socket failed: {1}.", Proto, Err) .str(); - bool HasAddrNotAvail = false; + bool HasProtocolError = false; handleAllErrors(std::move(Err), [&](std::unique_ptr ECErr) { - if (ECErr->convertToErrorCode() == - std::make_error_code(std::errc::address_not_available)) - HasAddrNotAvail = true; + std::error_code ec = ECErr->convertToErrorCode(); + if (ec == std::make_error_code(std::errc::address_family_not_supported) || + ec == std::make_error_code(std::errc::address_not_available)) + HasProtocolError = true; }); - if (HasAddrNotAvail) { + if (HasProtocolError) { GTEST_LOG_(WARNING) << llvm::formatv( "Assuming the host does not support {0}. Skipping test.", Proto) diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst --- a/llvm/docs/DeveloperPolicy.rst +++ b/llvm/docs/DeveloperPolicy.rst @@ -298,11 +298,10 @@ Obtaining Commit Access ----------------------- -New Contributors -^^^^^^^^^^^^^^^^ We grant commit access to contributors with a track record of submitting high quality patches. If you would like commit access, please send an email to -`Chris `_ with your GitHub username. +`Chris `_ with your GitHub username. This is true +for former contributors with SVN access as well as new contributors. Prior to obtaining commit access, it is common practice to request that someone with commit access commits on your behalf. When doing so, please @@ -345,12 +344,6 @@ encouraged to review other peoples' patches as well, but you aren't required to do so. 
-Current Contributors - Transferring from SVN -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If you had commit access to SVN and would like to request commit access to -GitHub, please email `llvm-admin `_ with your -SVN username and GitHub username. - .. _discuss the change/gather consensus: Making a Major Change diff --git a/llvm/examples/CMakeLists.txt b/llvm/examples/CMakeLists.txt --- a/llvm/examples/CMakeLists.txt +++ b/llvm/examples/CMakeLists.txt @@ -8,7 +8,6 @@ add_subdirectory(OrcV2Examples) add_subdirectory(SpeculativeJIT) add_subdirectory(Bye) -add_subdirectory(ThinLtoJIT) if(LLVM_ENABLE_EH AND (NOT WIN32) AND (NOT "${LLVM_NATIVE_ARCH}" STREQUAL "ARM")) add_subdirectory(ExceptionDemo) diff --git a/llvm/examples/OrcV2Examples/CMakeLists.txt b/llvm/examples/OrcV2Examples/CMakeLists.txt --- a/llvm/examples/OrcV2Examples/CMakeLists.txt +++ b/llvm/examples/OrcV2Examples/CMakeLists.txt @@ -11,7 +11,3 @@ add_subdirectory(OrcV2CBindingsAddObjectFile) add_subdirectory(OrcV2CBindingsBasicUsage) add_subdirectory(OrcV2CBindingsReflectProcessSymbols) - -if(CMAKE_HOST_UNIX) - add_subdirectory(LLJITWithChildProcess) -endif() diff --git a/llvm/examples/OrcV2Examples/LLJITWithChildProcess/CMakeLists.txt b/llvm/examples/OrcV2Examples/LLJITWithChildProcess/CMakeLists.txt deleted file mode 100644 --- a/llvm/examples/OrcV2Examples/LLJITWithChildProcess/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -set(LLVM_LINK_COMPONENTS - Core - ExecutionEngine - IRReader - JITLink - OrcError - OrcJIT - Support - nativecodegen - ) - -add_llvm_example(LLJITInChildProcess - LLJITWithChildProcess.cpp - ) diff --git a/llvm/examples/OrcV2Examples/LLJITWithChildProcess/LLJITWithChildProcess.cpp b/llvm/examples/OrcV2Examples/LLJITWithChildProcess/LLJITWithChildProcess.cpp deleted file mode 100644 --- a/llvm/examples/OrcV2Examples/LLJITWithChildProcess/LLJITWithChildProcess.cpp +++ /dev/null @@ -1,128 +0,0 @@ -//===--- LLJITWithChildProcess.cpp - LLJIT targeting a child process ------===// -// 
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// In this example we will execute JITed code in a child process: -// -// 1. Launch a remote process. -// 2. Create a JITLink-compatible remote memory manager. -// 3. Use LLJITBuilder to create a (greedy) LLJIT instance. -// 4. Add the Add1Example module and execute add1(). -// 5. Terminate the remote target session. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h" -#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" -#include "llvm/ExecutionEngine/Orc/LLJIT.h" -#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" -#include "llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/raw_ostream.h" - -#include "../ExampleModules.h" -#include "RemoteJITUtils.h" - -#include -#include - -#define DEBUG_TYPE "orc" - -using namespace llvm; -using namespace llvm::orc; - -// Executable running in the child process for remote execution. It communicates -// via stdin/stdout pipes. 
-cl::opt - ChildExecPath("remote-process", cl::Required, - cl::desc("Specify the filename of the process to launch for " - "remote JITing."), - cl::value_desc("filename")); - -int main(int argc, char *argv[]) { - InitLLVM X(argc, argv); - - InitializeNativeTarget(); - InitializeNativeTargetAsmPrinter(); - - cl::ParseCommandLineOptions(argc, argv, "LLJITWithChildProcess"); - - ExitOnError ExitOnErr; - ExitOnErr.setBanner(std::string(argv[0]) + ": "); - - if (!sys::fs::can_execute(ChildExecPath)) { - WithColor::error(errs(), argv[0]) - << "Child executable invalid: '" << ChildExecPath << "'\n"; - return -1; - } - - ExecutionSession ES; - ES.setErrorReporter([&](Error Err) { ExitOnErr(std::move(Err)); }); - - // Launch the remote process and get a channel to it. - pid_t ChildPID; - std::unique_ptr Ch = launchRemote(ChildExecPath, ChildPID); - if (!Ch) { - WithColor::error(errs(), argv[0]) << "Failed to launch remote JIT.\n"; - exit(1); - } - - LLVM_DEBUG({ - dbgs() - << "Launched executable in subprocess " << ChildPID << ":\n" - << ChildExecPath << "\n\n" - << "You may want to attach a debugger now. Press enter to continue.\n"; - fflush(stdin); - getchar(); - }); - - std::unique_ptr Client = - ExitOnErr(remote::OrcRemoteTargetClient::Create(*Ch, ES)); - - // Create a JITLink-compatible remote memory manager. - using MemManager = remote::OrcRemoteTargetClient::RemoteJITLinkMemoryManager; - std::unique_ptr RemoteMM = - ExitOnErr(Client->createRemoteJITLinkMemoryManager()); - - // Our remote target is running on the host system. - auto JTMB = ExitOnErr(JITTargetMachineBuilder::detectHost()); - JTMB.setCodeModel(CodeModel::Small); - - // Create an LLJIT instance with a JITLink ObjectLinkingLayer. 
- auto J = ExitOnErr( - LLJITBuilder() - .setJITTargetMachineBuilder(std::move(JTMB)) - .setObjectLinkingLayerCreator( - [&](ExecutionSession &ES, - const Triple &TT) -> std::unique_ptr { - return std::make_unique(ES, *RemoteMM); - }) - .create()); - - auto M = ExitOnErr(parseExampleModule(Add1Example, "add1")); - - ExitOnErr(J->addIRModule(std::move(M))); - - // Look up the JIT'd function. - auto Add1Sym = ExitOnErr(J->lookup("add1")); - - // Run in child target. - Expected Result = Client->callIntInt(Add1Sym.getAddress(), 42); - if (Result) - outs() << "add1(42) = " << *Result << "\n"; - else - ES.reportError(Result.takeError()); - - // Signal the remote target that we're done JITing. - ExitOnErr(Client->terminateSession()); - LLVM_DEBUG(dbgs() << "Subprocess terminated\n"); - - return 0; -} diff --git a/llvm/examples/OrcV2Examples/LLJITWithChildProcess/RemoteJITUtils.h b/llvm/examples/OrcV2Examples/LLJITWithChildProcess/RemoteJITUtils.h deleted file mode 100644 --- a/llvm/examples/OrcV2Examples/LLJITWithChildProcess/RemoteJITUtils.h +++ /dev/null @@ -1,121 +0,0 @@ -//===-- RemoteJITUtils.h - Utilities for remote-JITing ----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Utilities for remote-JITing -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXAMPLES_ORCV2EXAMPLES_LLJITWITHCHILDPROCESS_REMOTEJITUTILS_H -#define LLVM_EXAMPLES_ORCV2EXAMPLES_LLJITWITHCHILDPROCESS_REMOTEJITUTILS_H - -#include "llvm/ExecutionEngine/Orc/RPC/RawByteChannel.h" -#include - -#if !defined(_MSC_VER) && !defined(__MINGW32__) -#include -#else -#include -#endif - -/// RPC channel that reads from and writes from file descriptors. 
-class FDRawChannel final : public llvm::orc::rpc::RawByteChannel { -public: - FDRawChannel(int InFD, int OutFD) : InFD(InFD), OutFD(OutFD) {} - - llvm::Error readBytes(char *Dst, unsigned Size) override { - assert(Dst && "Attempt to read into null."); - ssize_t Completed = 0; - while (Completed < static_cast(Size)) { - ssize_t Read = ::read(InFD, Dst + Completed, Size - Completed); - if (Read <= 0) { - auto ErrNo = errno; - if (ErrNo == EAGAIN || ErrNo == EINTR) - continue; - else - return llvm::errorCodeToError( - std::error_code(errno, std::generic_category())); - } - Completed += Read; - } - return llvm::Error::success(); - } - - llvm::Error appendBytes(const char *Src, unsigned Size) override { - assert(Src && "Attempt to append from null."); - ssize_t Completed = 0; - while (Completed < static_cast(Size)) { - ssize_t Written = ::write(OutFD, Src + Completed, Size - Completed); - if (Written < 0) { - auto ErrNo = errno; - if (ErrNo == EAGAIN || ErrNo == EINTR) - continue; - else - return llvm::errorCodeToError( - std::error_code(errno, std::generic_category())); - } - Completed += Written; - } - return llvm::Error::success(); - } - - llvm::Error send() override { return llvm::Error::success(); } - -private: - int InFD, OutFD; -}; - -// Launch child process and return a channel to it. -std::unique_ptr launchRemote(std::string ExecPath, - pid_t &ChildPID) { - // Create two pipes. - int PipeFD[2][2]; - if (pipe(PipeFD[0]) != 0 || pipe(PipeFD[1]) != 0) - perror("Error creating pipe: "); - - ChildPID = fork(); - - if (ChildPID == 0) { - // In the child... - - // Close the parent ends of the pipes - close(PipeFD[0][1]); - close(PipeFD[1][0]); - - // Execute the child process. 
- std::unique_ptr ChildPath, ChildIn, ChildOut; - { - ChildPath.reset(new char[ExecPath.size() + 1]); - std::copy(ExecPath.begin(), ExecPath.end(), &ChildPath[0]); - ChildPath[ExecPath.size()] = '\0'; - std::string ChildInStr = llvm::utostr(PipeFD[0][0]); - ChildIn.reset(new char[ChildInStr.size() + 1]); - std::copy(ChildInStr.begin(), ChildInStr.end(), &ChildIn[0]); - ChildIn[ChildInStr.size()] = '\0'; - std::string ChildOutStr = llvm::utostr(PipeFD[1][1]); - ChildOut.reset(new char[ChildOutStr.size() + 1]); - std::copy(ChildOutStr.begin(), ChildOutStr.end(), &ChildOut[0]); - ChildOut[ChildOutStr.size()] = '\0'; - } - - char *const args[] = {&ChildPath[0], &ChildIn[0], &ChildOut[0], nullptr}; - int rc = execv(ExecPath.c_str(), args); - if (rc != 0) - perror("Error executing child process: "); - llvm_unreachable("Error executing child process"); - } - // else we're the parent... - - // Close the child ends of the pipes - close(PipeFD[0][0]); - close(PipeFD[1][1]); - - // Return an RPC channel connected to our end of the pipes. 
- return std::make_unique(PipeFD[1][0], PipeFD[0][1]); -} - -#endif diff --git a/llvm/examples/ThinLtoJIT/CMakeLists.txt b/llvm/examples/ThinLtoJIT/CMakeLists.txt deleted file mode 100644 --- a/llvm/examples/ThinLtoJIT/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -set(LLVM_LINK_COMPONENTS - BitReader - Core - IRReader - OrcJIT - ExecutionEngine - Support - nativecodegen - Analysis - Passes - ) - -add_llvm_example(ThinLtoJIT - main.cpp - ThinLtoJIT.cpp - ThinLtoModuleIndex.cpp - ThinLtoInstrumentationLayer.cpp - ThinLtoDiscoveryThread.cpp - ) diff --git a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h deleted file mode 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef LLVM_EXAMPLES_THINLTOJIT_DISCOVERYTHREAD_H -#define LLVM_EXAMPLES_THINLTOJIT_DISCOVERYTHREAD_H - -#include "llvm/ADT/StringRef.h" -#include "llvm/IR/ModuleSummaryIndex.h" - -#include "ThinLtoJIT.h" - -#include -#include - -namespace llvm { -namespace orc { - -class ExecutionSession; -class ThinLtoModuleIndex; -class ThinLtoInstrumentationLayer; - -class ThinLtoDiscoveryThread { -public: - ThinLtoDiscoveryThread(std::atomic &RunningFlag, ExecutionSession &ES, - JITDylib *MainJD, ThinLtoInstrumentationLayer &L, - ThinLtoModuleIndex &GlobalIndex, - ThinLtoJIT::AddModuleFunction AddModule, - unsigned LookaheadLevels, bool PrintStats) - : KeepRunning(RunningFlag), ES(ES), Layer(L), GlobalIndex(GlobalIndex), - AddModule(std::move(AddModule)), LookaheadLevels(LookaheadLevels), - PrintStats(PrintStats) {} - - ~ThinLtoDiscoveryThread() { - if (PrintStats) - dump(errs()); - } - - void operator()(); - - void dump(raw_ostream &OS) { - OS << format("Modules submitted asynchronously: %d\n", NumModulesSubmitted); - } - -private: - std::atomic &KeepRunning; - ExecutionSession &ES; - ThinLtoInstrumentationLayer &Layer; - ThinLtoModuleIndex &GlobalIndex; - ThinLtoJIT::AddModuleFunction AddModule; - unsigned 
LookaheadLevels; - bool PrintStats; - unsigned NumModulesSubmitted{0}; - - void spawnLookupForHighRankModules(); -}; - -} // namespace orc -} // namespace llvm - -#endif diff --git a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp deleted file mode 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include "ThinLtoDiscoveryThread.h" - -#include "llvm/IR/GlobalValue.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Error.h" - -#include "ThinLtoInstrumentationLayer.h" -#include "ThinLtoModuleIndex.h" - -#include - -#define DEBUG_TYPE "thinltojit" - -namespace llvm { -namespace orc { - -void ThinLtoDiscoveryThread::operator()() { - while (KeepRunning.load()) { - std::vector Indexes = Layer.takeFlagsThatFired(); - - if (!Indexes.empty()) { - LLVM_DEBUG(dbgs() << Indexes.size() << " new flags raised\n"); - auto ReachedFunctions = Layer.takeFlagOwners(std::move(Indexes)); - - for (GlobalValue::GUID F : ReachedFunctions) { - if (GlobalValueSummary *S = GlobalIndex.getSummary(F)) { - assert(isa(S) && "Reached symbols are functions"); - GlobalIndex.discoverCalleeModulePaths(cast(S), - LookaheadLevels); - } else { - LLVM_DEBUG(dbgs() << "No summary for GUID: " << F << "\n"); - } - } - - if (GlobalIndex.getNumDiscoveredModules() > 0) - spawnLookupForHighRankModules(); - } - } -} - -void ThinLtoDiscoveryThread::spawnLookupForHighRankModules() { - std::vector Paths = GlobalIndex.selectNextPaths(); - GlobalIndex.scheduleModuleParsing(Paths); - - // In order to add modules we need exclusive access to the execution session. - std::thread([this, Paths = std::move(Paths)]() { - ES.runSessionLocked([this, Paths = std::move(Paths)]() mutable { - for (const std::string &Path : Paths) { - ThreadSafeModule TSM = GlobalIndex.takeModule(Path); - if (!TSM) - // In the meantime the module was added synchronously. 
- continue; - - if (Error LoadErr = AddModule(std::move(TSM))) - // Failed to add the module to the session. - ES.reportError(std::move(LoadErr)); - - ++NumModulesSubmitted; - } - }); - }).detach(); -} - -} // namespace orc -} // namespace llvm diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h deleted file mode 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef LLVM_EXAMPLES_THINLTOJIT_DISCOVERYLAYER_H -#define LLVM_EXAMPLES_THINLTOJIT_DISCOVERYLAYER_H - -#include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/Orc/Core.h" -#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" -#include "llvm/ExecutionEngine/Orc/Layer.h" -#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/Support/raw_ostream.h" - -#include "ThinLtoJIT.h" - -#include -#include -#include -#include -#include - -namespace llvm { -namespace orc { - -class ThinLtoInstrumentationLayer : public IRLayer { -public: - ThinLtoInstrumentationLayer(ExecutionSession &ES, IRCompileLayer &BaseLayer, - ThinLtoJIT::ExplicitMemoryBarrier MemFence, - unsigned FlagsPerBucket) - : IRLayer(ES, BaseLayer.getManglingOptions()), BaseLayer(BaseLayer), - MemFence(MemFence) { - // TODO: So far we only allocate one bucket. 
- allocateDiscoveryFlags(FlagsPerBucket); - } - - ~ThinLtoInstrumentationLayer() override; - - void emit(std::unique_ptr R, - ThreadSafeModule TSM) override; - - unsigned reserveDiscoveryFlags(unsigned Count); - void registerDiscoveryFlagOwners(std::vector Guids, - unsigned FirstIdx); - - void nudgeIntoDiscovery(std::vector Functions); - - std::vector takeFlagsThatFired(); - std::vector takeFlagOwners(std::vector Indexes); - - void dump(raw_ostream &OS); - -private: - IRCompileLayer &BaseLayer; - ThinLtoJIT::ExplicitMemoryBarrier MemFence; - - enum Flag : uint8_t { Clear = 0, Fired = 1 }; - - // Lock-free read access. - uint8_t *FlagsStorage; - Flag *FlagsIncoming; // lock-free write by design - Flag *FlagsHandled; - unsigned NumFlagsAllocated; - std::atomic NumFlagsUsed; // spin-lock - - // Acquire/release sync between writers and reader - std::atomic FlagsSync; - - // STL container requires locking for both, read and write access. - mutable std::mutex DiscoveryFlagsInfoLock; - std::map FlagOwnersMap; - - void allocateDiscoveryFlags(unsigned MinFlags); - void compileFunctionReachedFlagSetter(IRBuilder<> &B, Flag *F); -}; - -} // namespace orc -} // namespace llvm - -#endif diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp deleted file mode 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp +++ /dev/null @@ -1,225 +0,0 @@ -#include "ThinLtoInstrumentationLayer.h" - -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Process.h" - -#include - -#define DEBUG_TYPE "thinltojit" - -namespace llvm { -namespace orc { - -// TODO: Fixed set of flags may not always be enough. Make this expandable. -void ThinLtoInstrumentationLayer::allocateDiscoveryFlags(unsigned MinFlags) { - // Round up to full memory pages. 
- unsigned PageSize = sys::Process::getPageSizeEstimate(); - unsigned NumPagesEach = (MinFlags + (PageSize - 1)) / PageSize; - unsigned NumPagesTotal = 2 * NumPagesEach; - assert(isPowerOf2_64(PageSize) && "Adjust aligned memory alloc below"); - - // Allocate one more page to make up for size loss due to alignment. - void *Storage = std::calloc(NumPagesTotal + 1, PageSize); - uint64_t StorageAddr = reinterpret_cast(Storage); - uint64_t PageSizeDecr = PageSize - 1; - uint64_t AlignedAddr = ((StorageAddr + PageSizeDecr) & ~PageSizeDecr); - uint64_t Diff = AlignedAddr - StorageAddr; - - // For each flag we allocate one byte in each location: Incoming and Handled. - // TODO: 'Handled' could be a bitset, but size must be dynamic - NumFlagsUsed.store(0); - NumFlagsAllocated = NumPagesEach * PageSize; - FlagsStorage = static_cast(Storage); - FlagsIncoming = reinterpret_cast(FlagsStorage + Diff); - FlagsHandled = FlagsIncoming + NumFlagsAllocated; - - static_assert(sizeof(FlagsIncoming[0]) == sizeof(uint8_t), "Flags are bytes"); - assert(reinterpret_cast(FlagsIncoming) % PageSize == 0); - assert(reinterpret_cast(FlagsHandled) % PageSize == 0); - assert(NumFlagsAllocated >= MinFlags); -} - -// Reserve a new set of discovery flags and return the index of the first one. 
-unsigned ThinLtoInstrumentationLayer::reserveDiscoveryFlags(unsigned Count) { -#ifndef NDEBUG - for (unsigned i = NumFlagsUsed.load(), e = i + Count; i < e; i++) { - assert(FlagsIncoming[i] == Clear); - } -#endif - - assert(Count > 0); - return NumFlagsUsed.fetch_add(Count); -} - -void ThinLtoInstrumentationLayer::registerDiscoveryFlagOwners( - std::vector Guids, unsigned FirstIdx) { - unsigned Count = Guids.size(); - - std::lock_guard Lock(DiscoveryFlagsInfoLock); - for (unsigned i = 0; i < Count; i++) { - assert(!FlagOwnersMap.count(FirstIdx + i) && - "Flag should not have an owner at this point"); - FlagOwnersMap[FirstIdx + i] = Guids[i]; - } -} - -std::vector ThinLtoInstrumentationLayer::takeFlagsThatFired() { - // This is only effective with the respective Release. - FlagsSync.load(std::memory_order_acquire); - - std::vector Indexes; - unsigned NumIndexesUsed = NumFlagsUsed.load(); - for (unsigned i = 0; i < NumIndexesUsed; i++) { - if (FlagsIncoming[i] == Fired && FlagsHandled[i] == Clear) { - FlagsHandled[i] = Fired; - Indexes.push_back(i); - } - } - - return Indexes; -} - -std::vector -ThinLtoInstrumentationLayer::takeFlagOwners(std::vector Indexes) { - std::vector ReachedFunctions; - std::lock_guard Lock(DiscoveryFlagsInfoLock); - - for (unsigned i : Indexes) { - auto KV = FlagOwnersMap.find(i); - assert(KV != FlagOwnersMap.end()); - ReachedFunctions.push_back(KV->second); - FlagOwnersMap.erase(KV); - } - - return ReachedFunctions; -} - -void ThinLtoInstrumentationLayer::nudgeIntoDiscovery( - std::vector Functions) { - unsigned Count = Functions.size(); - - // Registering synthetic flags in advance. We expect them to get processed - // before the respective functions get emitted. If not, the emit() function - unsigned FirstFlagIdx = reserveDiscoveryFlags(Functions.size()); - registerDiscoveryFlagOwners(std::move(Functions), FirstFlagIdx); - - // Initialize the flags as fired and force a cache sync, so discovery will - // pick them up as soon as possible. 
- for (unsigned i = FirstFlagIdx; i < FirstFlagIdx + Count; i++) { - FlagsIncoming[i] = Fired; - } - if (MemFence & ThinLtoJIT::FenceStaticCode) { - FlagsSync.store(0, std::memory_order_release); - } - - LLVM_DEBUG(dbgs() << "Nudged " << Count << " new functions into discovery\n"); -} - -void ThinLtoInstrumentationLayer::emit( - std::unique_ptr R, ThreadSafeModule TSM) { - TSM.withModuleDo([this](Module &M) { - std::vector FunctionsToInstrument; - - // We may have discovered ahead of some functions already, but we still - // instrument them all. Their notifications steer the future direction of - // discovery. - for (Function &F : M.getFunctionList()) - if (!F.isDeclaration()) - FunctionsToInstrument.push_back(&F); - - if (!FunctionsToInstrument.empty()) { - IRBuilder<> B(M.getContext()); - std::vector NewDiscoveryRoots; - - // Flags that fire must have owners registered. We will do it below and - // that's fine, because they can only be reached once the code is emitted. - unsigned FirstFlagIdx = - reserveDiscoveryFlags(FunctionsToInstrument.size()); - - unsigned NextFlagIdx = FirstFlagIdx; - for (Function *F : FunctionsToInstrument) { - // TODO: Emitting the write operation into an indirection stub would - // allow to skip it once we got the notification. - BasicBlock *E = &F->getEntryBlock(); - B.SetInsertPoint(BasicBlock::Create( - M.getContext(), "NotifyFunctionReachedProlog", F, E)); - compileFunctionReachedFlagSetter(B, FlagsIncoming + NextFlagIdx); - B.CreateBr(E); - - std::string GlobalName = GlobalValue::getGlobalIdentifier( - F->getName(), F->getLinkage(), M.getSourceFileName()); - NewDiscoveryRoots.push_back(GlobalValue::getGUID(GlobalName)); - ++NextFlagIdx; - } - - LLVM_DEBUG(dbgs() << "Instrumented " << NewDiscoveryRoots.size() - << " new functions in module " << M.getName() << "\n"); - - // Submit owner info, so the DiscoveryThread can evaluate the flags. 
- registerDiscoveryFlagOwners(std::move(NewDiscoveryRoots), FirstFlagIdx); - } - }); - - BaseLayer.emit(std::move(R), std::move(TSM)); -} - -void ThinLtoInstrumentationLayer::compileFunctionReachedFlagSetter( - IRBuilder<> &B, Flag *F) { - assert(*F == Clear); - Type *Int64Ty = Type::getInt64Ty(B.getContext()); - - // Write one immediate 8bit value to a fixed location in memory. - auto FlagAddr = pointerToJITTargetAddress(F); - Type *FlagTy = Type::getInt8Ty(B.getContext()); - B.CreateStore(ConstantInt::get(FlagTy, Fired), - B.CreateIntToPtr(ConstantInt::get(Int64Ty, FlagAddr), - FlagTy->getPointerTo())); - - if (MemFence & ThinLtoJIT::FenceJITedCode) { - // Overwrite the sync value with Release ordering. The discovery thread - // reads it with Acquire ordering. The actual value doesn't matter. - static constexpr bool IsVolatile = true; - static constexpr Instruction *NoInsertBefore = nullptr; - auto SyncFlagAddr = pointerToJITTargetAddress(&FlagsSync); - - B.Insert( - new StoreInst(ConstantInt::get(Int64Ty, 0), - B.CreateIntToPtr(ConstantInt::get(Int64Ty, SyncFlagAddr), - Int64Ty->getPointerTo()), - IsVolatile, Align(64), AtomicOrdering::Release, - SyncScope::System, NoInsertBefore)); - } -} - -void ThinLtoInstrumentationLayer::dump(raw_ostream &OS) { - OS << "Discovery flags stats\n"; - - unsigned NumFlagsFired = 0; - for (unsigned i = 0; i < NumFlagsAllocated; i++) { - if (FlagsIncoming[i] == Fired) - ++NumFlagsFired; - } - OS << "Alloc: " << format("%6.d", NumFlagsAllocated) << "\n"; - OS << "Issued: " << format("%6.d", NumFlagsUsed.load()) << "\n"; - OS << "Fired: " << format("%6.d", NumFlagsFired) << "\n"; - - unsigned RemainingFlagOwners = 0; - for (const auto &_ : FlagOwnersMap) { - ++RemainingFlagOwners; - (void)_; - } - OS << "\nFlagOwnersMap has " << RemainingFlagOwners - << " remaining entries.\n"; -} - -ThinLtoInstrumentationLayer::~ThinLtoInstrumentationLayer() { - std::free(FlagsStorage); -} - -} // namespace orc -} // namespace llvm diff --git 
a/llvm/examples/ThinLtoJIT/ThinLtoJIT.h b/llvm/examples/ThinLtoJIT/ThinLtoJIT.h deleted file mode 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoJIT.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef LLVM_EXAMPLES_THINLTOJIT_THINLTOJIT_H -#define LLVM_EXAMPLES_THINLTOJIT_THINLTOJIT_H - -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" -#include "llvm/ExecutionEngine/Orc/Core.h" -#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" -#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" -#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/ThreadPool.h" - -#include -#include -#include -#include -#include - -namespace llvm { -namespace orc { - -class ThinLtoDiscoveryThread; -class ThinLtoInstrumentationLayer; -class ThinLtoModuleIndex; - -class CompileOnDemandLayer; -class IRCompileLayer; -class RTDyldObjectLinkingLayer; - -class JITDylib; -class JITTargetMachineBuilder; -class LazyCallThroughManager; -class MangleAndInterner; - -class ThinLtoJIT { -public: - using AddModuleFunction = std::function; - - enum ExplicitMemoryBarrier { - NeverFence = 0, - FenceStaticCode = 1, - FenceJITedCode = 2, - AlwaysFence = 3 - }; - - ThinLtoJIT(ArrayRef InputFiles, StringRef MainFunctionName, - unsigned LookaheadLevels, unsigned NumCompileThreads, - unsigned NumLoadThreads, unsigned DiscoveryFlagsPerBucket, - ExplicitMemoryBarrier MemFence, bool AllowNudgeIntoDiscovery, - bool PrintStats, Error &Err); - ~ThinLtoJIT(); - - ThinLtoJIT(const ThinLtoJIT &) = delete; - ThinLtoJIT &operator=(const ThinLtoJIT &) = delete; - ThinLtoJIT(ThinLtoJIT &&) = delete; - ThinLtoJIT &operator=(ThinLtoJIT &&) = delete; - - Expected main(ArrayRef Args) { - auto MainSym = ES.lookup({MainJD}, MainFunctionMangled); - if (!MainSym) - return MainSym.takeError(); - - using MainFn = int(int, char *[]); - auto Main = jitTargetAddressToFunction(MainSym->getAddress()); - - return runAsMain(Main, Args, 
StringRef("ThinLtoJIT")); - } - -private: - ExecutionSession ES; - DataLayout DL{""}; - - JITDylib *MainJD; - SymbolStringPtr MainFunctionMangled; - std::unique_ptr CompileThreads; - std::unique_ptr GlobalIndex; - - AddModuleFunction AddModule; - std::unique_ptr ObjLinkingLayer; - std::unique_ptr CompileLayer; - std::unique_ptr InstrumentationLayer; - std::unique_ptr OnDemandLayer; - - std::atomic JitRunning; - std::thread DiscoveryThread; - std::unique_ptr DiscoveryThreadWorker; - - std::unique_ptr Mangle; - std::unique_ptr CallThroughManager; - - void setupLayers(JITTargetMachineBuilder JTMB, unsigned NumCompileThreads, - unsigned DiscoveryFlagsPerBucket, - ExplicitMemoryBarrier MemFence); - Error setupJITDylib(JITDylib *JD, bool AllowNudge, bool PrintStats); - void setupDiscovery(JITDylib *MainJD, unsigned LookaheadLevels, - bool PrintStats); - Expected setupMainModule(StringRef MainFunction); - Expected setupTargetUtils(Module *M); - Error applyDataLayout(Module *M); - - static void exitOnLazyCallThroughFailure() { - errs() << "Compilation failed. 
Aborting.\n"; - exit(1); - } -}; - -} // namespace orc -} // namespace llvm - -#endif diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp deleted file mode 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp +++ /dev/null @@ -1,339 +0,0 @@ -#include "ThinLtoJIT.h" - -#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" -#include "llvm/ExecutionEngine/Orc/CompileUtils.h" -#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" -#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" -#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" -#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" -#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" -#include "llvm/ExecutionEngine/SectionMemoryManager.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Host.h" - -#include "ThinLtoDiscoveryThread.h" -#include "ThinLtoInstrumentationLayer.h" -#include "ThinLtoModuleIndex.h" - -#include -#include -#include - -#ifndef NDEBUG -#include -#endif - -#define DEBUG_TYPE "thinltojit" - -namespace llvm { -namespace orc { - -class ThinLtoDefinitionGenerator : public JITDylib::DefinitionGenerator { -public: - ThinLtoDefinitionGenerator(ThinLtoModuleIndex &GlobalIndex, - ThinLtoInstrumentationLayer &InstrumentationLayer, - ThinLtoJIT::AddModuleFunction AddModule, - char Prefix, bool AllowNudge, bool PrintStats) - : GlobalIndex(GlobalIndex), InstrumentationLayer(InstrumentationLayer), - AddModule(std::move(AddModule)), ManglePrefix(Prefix), - AllowNudgeIntoDiscovery(AllowNudge), PrintStats(PrintStats) {} - - ~ThinLtoDefinitionGenerator() { - if (PrintStats) - dump(errs()); - } - - Error tryToGenerate(LookupKind K, JITDylib &JD, - JITDylibLookupFlags JDLookupFlags, - const SymbolLookupSet &Symbols) override; - - void dump(raw_ostream &OS) { - OS << format("Modules submitted synchronously: %d\n", NumModulesMissed); - } - -private: - ThinLtoModuleIndex &GlobalIndex; - ThinLtoInstrumentationLayer &InstrumentationLayer; - 
ThinLtoJIT::AddModuleFunction AddModule; - char ManglePrefix; - bool AllowNudgeIntoDiscovery; - bool PrintStats; - unsigned NumModulesMissed{0}; - - // ThinLTO summaries encode unprefixed names. - StringRef stripGlobalManglePrefix(StringRef Symbol) const { - bool Strip = (ManglePrefix != '\0' && Symbol[0] == ManglePrefix); - return Strip ? StringRef(Symbol.data() + 1, Symbol.size() - 1) : Symbol; - } -}; - -Error ThinLtoDefinitionGenerator::tryToGenerate( - LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags, - const SymbolLookupSet &Symbols) { - std::set ModulePaths; - std::vector NewDiscoveryRoots; - - for (const auto &KV : Symbols) { - StringRef UnmangledName = stripGlobalManglePrefix(*KV.first); - auto Guid = GlobalValue::getGUID(UnmangledName); - if (GlobalValueSummary *S = GlobalIndex.getSummary(Guid)) { - // We could have discovered it ahead of time. - LLVM_DEBUG(dbgs() << format("Failed to discover symbol: %s\n", - UnmangledName.str().c_str())); - ModulePaths.insert(S->modulePath()); - if (AllowNudgeIntoDiscovery && isa(S)) { - NewDiscoveryRoots.push_back(Guid); - } - } - } - - NumModulesMissed += ModulePaths.size(); - - // Parse the requested modules if it hasn't happened yet. - GlobalIndex.scheduleModuleParsing(ModulePaths); - - for (StringRef Path : ModulePaths) { - ThreadSafeModule TSM = GlobalIndex.takeModule(Path); - assert(TSM && "We own the session lock, no asynchronous access possible"); - - if (Error LoadErr = AddModule(std::move(TSM))) - // Failed to add the module to the session. - return LoadErr; - - LLVM_DEBUG(dbgs() << "Generator: added " << Path << " synchronously\n"); - } - - // Requested functions that we failed to discover ahead of time, are likely - // close to the execution front. We can anticipate to run into them as soon - // as execution continues and trigger their discovery flags already now. This - // behavior is enabled with the 'allow-nudge' option and implemented below. 
- // On the one hand, it may give us a head start in a moment where discovery - // was lacking behind. On the other hand, we may bet on the wrong horse and - // waste extra time speculating in the wrong direction. - if (!NewDiscoveryRoots.empty()) { - assert(AllowNudgeIntoDiscovery); - InstrumentationLayer.nudgeIntoDiscovery(std::move(NewDiscoveryRoots)); - } - - return Error::success(); -} - -ThinLtoJIT::ThinLtoJIT(ArrayRef InputFiles, - StringRef MainFunctionName, unsigned LookaheadLevels, - unsigned NumCompileThreads, unsigned NumLoadThreads, - unsigned DiscoveryFlagsPerBucket, - ExplicitMemoryBarrier MemFence, - bool AllowNudgeIntoDiscovery, bool PrintStats, - Error &Err) { - ErrorAsOutParameter ErrAsOutParam(&Err); - - // Populate the module index, so we know which modules exist and we can find - // the one that defines the main function. - GlobalIndex = std::make_unique(ES, NumLoadThreads); - for (StringRef F : InputFiles) { - if (auto Err = GlobalIndex->add(F)) - ES.reportError(std::move(Err)); - } - - // Load the module that defines the main function. - auto TSM = setupMainModule(MainFunctionName); - if (!TSM) { - Err = TSM.takeError(); - return; - } - - // Infer target-specific utils from the main module. - ThreadSafeModule MainModule = std::move(*TSM); - auto JTMB = setupTargetUtils(MainModule.getModuleUnlocked()); - if (!JTMB) { - Err = JTMB.takeError(); - return; - } - - // Set up the JIT compile pipeline. - setupLayers(std::move(*JTMB), NumCompileThreads, DiscoveryFlagsPerBucket, - MemFence); - - // We can use the mangler now. Remember the mangled name of the main function. - MainFunctionMangled = (*Mangle)(MainFunctionName); - - // We are restricted to a single dylib currently. Add runtime overrides and - // symbol generators. - MainJD = &ES.createBareJITDylib("main"); - Err = setupJITDylib(MainJD, AllowNudgeIntoDiscovery, PrintStats); - if (Err) - return; - - // Spawn discovery thread and let it add newly discovered modules to the JIT. 
- setupDiscovery(MainJD, LookaheadLevels, PrintStats); - - Err = AddModule(std::move(MainModule)); - if (Err) - return; - - if (AllowNudgeIntoDiscovery) { - auto MainFunctionGuid = GlobalValue::getGUID(MainFunctionName); - InstrumentationLayer->nudgeIntoDiscovery({MainFunctionGuid}); - } -} - -Expected ThinLtoJIT::setupMainModule(StringRef MainFunction) { - Optional M = GlobalIndex->getModulePathForSymbol(MainFunction); - if (!M) { - std::string Buffer; - raw_string_ostream OS(Buffer); - OS << "No ValueInfo for symbol '" << MainFunction; - OS << "' in provided modules: "; - for (StringRef P : GlobalIndex->getAllModulePaths()) - OS << P << " "; - OS << "\n"; - return createStringError(inconvertibleErrorCode(), OS.str()); - } - - if (auto TSM = GlobalIndex->parseModuleFromFile(*M)) - return std::move(TSM); // Not a redundant move: fix build on gcc-7.5 - - return createStringError(inconvertibleErrorCode(), - "Failed to parse main module"); -} - -Expected ThinLtoJIT::setupTargetUtils(Module *M) { - std::string T = M->getTargetTriple(); - JITTargetMachineBuilder JTMB(Triple(T.empty() ? sys::getProcessTriple() : T)); - - // CallThroughManager is ABI-specific - auto LCTM = createLocalLazyCallThroughManager( - JTMB.getTargetTriple(), ES, - pointerToJITTargetAddress(exitOnLazyCallThroughFailure)); - if (!LCTM) - return LCTM.takeError(); - CallThroughManager = std::move(*LCTM); - - // Use DataLayout or the given module or fall back to the host's default. - DL = DataLayout(M); - if (DL.getStringRepresentation().empty()) { - auto HostDL = JTMB.getDefaultDataLayoutForTarget(); - if (!HostDL) - return HostDL.takeError(); - DL = std::move(*HostDL); - if (Error Err = applyDataLayout(M)) - return std::move(Err); - } - - // Now that we know the target data layout we can setup the mangler. 
- Mangle = std::make_unique(ES, DL); - return JTMB; -} - -Error ThinLtoJIT::applyDataLayout(Module *M) { - if (M->getDataLayout().isDefault()) - M->setDataLayout(DL); - - if (M->getDataLayout() != DL) - return make_error( - "Added modules have incompatible data layouts", - inconvertibleErrorCode()); - - return Error::success(); -} - -static bool IsTrivialModule(MaterializationUnit *MU) { - StringRef ModuleName = MU->getName(); - return ModuleName == "" || ModuleName == "" || - ModuleName == ""; -} - -void ThinLtoJIT::setupLayers(JITTargetMachineBuilder JTMB, - unsigned NumCompileThreads, - unsigned DiscoveryFlagsPerBucket, - ExplicitMemoryBarrier MemFence) { - ObjLinkingLayer = std::make_unique( - ES, []() { return std::make_unique(); }); - - CompileLayer = std::make_unique( - ES, *ObjLinkingLayer, std::make_unique(JTMB)); - - InstrumentationLayer = std::make_unique( - ES, *CompileLayer, MemFence, DiscoveryFlagsPerBucket); - - OnDemandLayer = std::make_unique( - ES, *InstrumentationLayer, *CallThroughManager, - createLocalIndirectStubsManagerBuilder(JTMB.getTargetTriple())); - // Don't break up modules. Insert stubs on module boundaries. - OnDemandLayer->setPartitionFunction(CompileOnDemandLayer::compileWholeModule); - - // Delegate compilation to the thread pool. - CompileThreads = std::make_unique( - llvm::hardware_concurrency(NumCompileThreads)); - ES.setDispatchMaterialization( - [this](std::unique_ptr MU, - std::unique_ptr MR) { - if (IsTrivialModule(MU.get())) { - // This should be quick and we may save a few session locks. - MU->materialize(std::move(MR)); - } else { - // FIXME: Drop the std::shared_ptr workaround once ThreadPool::async() - // accepts llvm::unique_function to define jobs. 
- CompileThreads->async( - [UnownedMU = MU.release(), UnownedMR = MR.release()]() { - std::unique_ptr MU(UnownedMU); - std::unique_ptr MR(UnownedMR); - MU->materialize(std::move(MR)); - }); - } - }); - - AddModule = [this](ThreadSafeModule TSM) -> Error { - assert(MainJD && "Setup MainJD JITDylib before calling"); - Module *M = TSM.getModuleUnlocked(); - if (Error Err = applyDataLayout(M)) - return Err; - VModuleKey Id = GlobalIndex->getModuleId(M->getName()); - return OnDemandLayer->add(*MainJD, std::move(TSM), Id); - }; -} - -void ThinLtoJIT::setupDiscovery(JITDylib *MainJD, unsigned LookaheadLevels, - bool PrintStats) { - JitRunning.store(true); - DiscoveryThreadWorker = std::make_unique( - JitRunning, ES, MainJD, *InstrumentationLayer, *GlobalIndex, AddModule, - LookaheadLevels, PrintStats); - - DiscoveryThread = std::thread(std::ref(*DiscoveryThreadWorker)); -} - -Error ThinLtoJIT::setupJITDylib(JITDylib *JD, bool AllowNudge, - bool PrintStats) { - // Register symbols for C++ static destructors. - LocalCXXRuntimeOverrides CXXRuntimeoverrides; - Error Err = CXXRuntimeoverrides.enable(*JD, *Mangle); - if (Err) - return Err; - - // Lookup symbol names in the global ThinLTO module index first - char Prefix = DL.getGlobalPrefix(); - JD->addGenerator(std::make_unique( - *GlobalIndex, *InstrumentationLayer, AddModule, Prefix, AllowNudge, - PrintStats)); - - // Then try lookup in the host process. - auto HostLookup = DynamicLibrarySearchGenerator::GetForCurrentProcess(Prefix); - if (!HostLookup) - return HostLookup.takeError(); - JD->addGenerator(std::move(*HostLookup)); - - return Error::success(); -} - -ThinLtoJIT::~ThinLtoJIT() { - // Signal the DiscoveryThread to shut down. - JitRunning.store(false); - DiscoveryThread.join(); - - // Wait for potential compile actions to finish. 
- CompileThreads->wait(); -} - -} // namespace orc -} // namespace llvm diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h deleted file mode 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef LLVM_EXAMPLES_THINLTOJIT_THINLTOJITMODULEINDEX_H -#define LLVM_EXAMPLES_THINLTOJIT_THINLTOJITMODULEINDEX_H - -#include "llvm/ADT/Optional.h" -#include "llvm/ExecutionEngine/Orc/Core.h" -#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/ModuleSummaryIndex.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/ThreadPool.h" - -#include -#include -#include -#include -#include - -namespace llvm { -namespace orc { - -class SymbolStringPtr; - -class ThinLtoModuleIndex { - static constexpr bool HaveGVs = false; - -public: - ThinLtoModuleIndex(ExecutionSession &ES, unsigned ParseModuleThreads) - : ES(ES), CombinedSummaryIndex(HaveGVs), - ParseModuleWorkers(llvm::hardware_concurrency(ParseModuleThreads)), - NumParseModuleThreads(ParseModuleThreads) {} - - Error add(StringRef InputPath); - GlobalValueSummary *getSummary(GlobalValue::GUID Function) const; - std::vector getAllModulePaths() const; - Optional getModulePathForSymbol(StringRef Name) const; - - template void scheduleModuleParsing(const RangeT &Paths); - ThreadSafeModule takeModule(StringRef Path); - - // Blocking module parsing, returns a Null-module on error. - // Only used for the main module. 
- ThreadSafeModule parseModuleFromFile(StringRef Path); - - std::vector selectNextPaths(); - unsigned getNumDiscoveredModules() const; - void discoverCalleeModulePaths(FunctionSummary *S, unsigned LookaheadLevels); - - VModuleKey getModuleId(StringRef Path) const { - return CombinedSummaryIndex.getModuleId(Path); - } - -private: - ExecutionSession &ES; - ModuleSummaryIndex CombinedSummaryIndex; - uint64_t NextModuleId{0}; - - struct PathRankEntry { - uint32_t Count{0}; - uint32_t MinDist{100}; - }; - StringMap PathRank; - - ThreadPool ParseModuleWorkers; - unsigned NumParseModuleThreads; - - std::mutex ScheduledModulesLock; - StringMap> ScheduledModules; - - std::mutex ParsedModulesLock; - StringMap ParsedModules; - - void updatePathRank(StringRef Path, unsigned Distance); - void addToWorklist(std::vector &List, - ArrayRef Calls); - - std::vector selectAllPaths(); - std::vector selectHotPaths(unsigned Count); - - void scheduleModuleParsingPrelocked(StringRef Path); - Expected doParseModule(StringRef Path); -}; - -template -inline void ThinLtoModuleIndex::scheduleModuleParsing(const RangeT &Paths) { - std::lock_guard Lock(ScheduledModulesLock); - for (const auto &Path : Paths) { - scheduleModuleParsingPrelocked(Path); - } -} - -} // namespace orc -} // namespace llvm - -#endif diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp deleted file mode 100644 --- a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp +++ /dev/null @@ -1,268 +0,0 @@ -#include "ThinLtoModuleIndex.h" - -#include "llvm/Bitcode/BitcodeReader.h" -#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/raw_ostream.h" - -#include -#include - -#define DEBUG_TYPE "thinltojit" - -namespace llvm { -namespace orc { - -Error ThinLtoModuleIndex::add(StringRef InputPath) { - auto Buffer = 
errorOrToExpected(MemoryBuffer::getFile(InputPath)); - if (!Buffer) - return Buffer.takeError(); - - Error ParseErr = readModuleSummaryIndex((*Buffer)->getMemBufferRef(), - CombinedSummaryIndex, NextModuleId); - if (ParseErr) - return ParseErr; - -#ifndef NDEBUG - auto Paths = getAllModulePaths(); - unsigned TotalPaths = Paths.size(); - std::sort(Paths.begin(), Paths.end()); - Paths.erase(std::unique(Paths.begin(), Paths.end()), Paths.end()); - assert(TotalPaths == Paths.size() && "Module paths must be unique"); -#endif - - ++NextModuleId; - return Error::success(); -} - -std::vector ThinLtoModuleIndex::getAllModulePaths() const { - auto ModuleTable = CombinedSummaryIndex.modulePaths(); - - std::vector Paths; - Paths.resize(ModuleTable.size()); - - for (const auto &KV : ModuleTable) { - assert(Paths[KV.second.first].empty() && "IDs are unique and continuous"); - Paths[KV.second.first] = KV.first(); - } - - return Paths; -} - -GlobalValueSummary * -ThinLtoModuleIndex::getSummary(GlobalValue::GUID Function) const { - ValueInfo VI = CombinedSummaryIndex.getValueInfo(Function); - if (!VI || VI.getSummaryList().empty()) - return nullptr; - - // There can be more than one symbol with the same GUID, in the case of same- - // named locals in different but same-named source files that were compiled in - // their respective directories (so the source file name and resulting GUID is - // the same). We avoid this by checking that module paths are unique upon - // add(). - // - // TODO: We can still get duplicates on symbols declared with - // attribute((weak)), a GNU extension supported by gcc and clang. - // We should support it by looking for a symbol in the current module - // or in the same module as the caller. 
- assert(VI.getSummaryList().size() == 1 && "Weak symbols not yet supported"); - - return VI.getSummaryList().front().get()->getBaseObject(); -} - -Optional -ThinLtoModuleIndex::getModulePathForSymbol(StringRef Name) const { - if (GlobalValueSummary *S = getSummary(GlobalValue::getGUID(Name))) - return S->modulePath(); - return None; // We don't know the symbol. -} - -void ThinLtoModuleIndex::scheduleModuleParsingPrelocked(StringRef Path) { - // Once the module was scheduled, we can call takeModule(). - auto ScheduledIt = ScheduledModules.find(Path); - if (ScheduledIt != ScheduledModules.end()) - return; - - auto Worker = [this](std::string Path) { - if (auto TSM = doParseModule(Path)) { - std::lock_guard Lock(ParsedModulesLock); - ParsedModules[Path] = std::move(*TSM); - - LLVM_DEBUG(dbgs() << "Finished parsing module: " << Path << "\n"); - } else { - ES.reportError(TSM.takeError()); - } - }; - - LLVM_DEBUG(dbgs() << "Schedule module for parsing: " << Path << "\n"); - ScheduledModules[Path] = ParseModuleWorkers.async(Worker, Path.str()); -} - -ThreadSafeModule ThinLtoModuleIndex::takeModule(StringRef Path) { - std::unique_lock ParseLock(ParsedModulesLock); - - auto ParsedIt = ParsedModules.find(Path); - if (ParsedIt == ParsedModules.end()) { - ParseLock.unlock(); - - // The module is not ready, wait for the future we stored. - std::unique_lock ScheduleLock(ScheduledModulesLock); - auto ScheduledIt = ScheduledModules.find(Path); - assert(ScheduledIt != ScheduledModules.end() && - "Don't call for unscheduled modules"); - std::shared_future Future = ScheduledIt->getValue(); - ScheduleLock.unlock(); - Future.get(); - - ParseLock.lock(); - ParsedIt = ParsedModules.find(Path); - assert(ParsedIt != ParsedModules.end() && "Must be ready now"); - } - - // We only add each module once. If it's not here anymore, we can skip it. 
- ThreadSafeModule TSM = std::move(ParsedIt->getValue()); - ParsedIt->getValue() = ThreadSafeModule(); - return TSM; -} - -ThreadSafeModule ThinLtoModuleIndex::parseModuleFromFile(StringRef Path) { - { - std::lock_guard ScheduleLock(ScheduledModulesLock); - scheduleModuleParsingPrelocked(Path); - } - return takeModule(Path); -} - -Expected ThinLtoModuleIndex::doParseModule(StringRef Path) { - // TODO: make a SMDiagnosticError class for this - SMDiagnostic Err; - auto Ctx = std::make_unique(); - auto M = parseIRFile(Path, Err, *Ctx); - if (!M) { - std::string ErrDescription; - { - raw_string_ostream S(ErrDescription); - Err.print("ThinLtoJIT", S); - } - return createStringError(inconvertibleErrorCode(), - "Failed to load module from file '%s' (%s)", - Path.data(), ErrDescription.c_str()); - } - - return ThreadSafeModule(std::move(M), std::move(Ctx)); -} - -// We don't filter visited functions. Discovery will often be retriggered -// from the middle of already visited functions and it aims to reach a little -// further each time. -void ThinLtoModuleIndex::discoverCalleeModulePaths(FunctionSummary *S, - unsigned LookaheadLevels) { - // Populate initial worklist - std::vector Worklist; - addToWorklist(Worklist, S->calls()); - unsigned Distance = 0; - - while (++Distance < LookaheadLevels) { - // Process current worklist and populate a new one. - std::vector NextWorklist; - for (FunctionSummary *F : Worklist) { - updatePathRank(F->modulePath(), Distance); - addToWorklist(NextWorklist, F->calls()); - } - Worklist = std::move(NextWorklist); - } - - // Process the last worklist without filling a new one - for (FunctionSummary *F : Worklist) { - updatePathRank(F->modulePath(), Distance); - } - - // Reset counts for known paths (includes both, scheduled and parsed modules). 
- std::lock_guard Lock(ScheduledModulesLock); - for (const auto &KV : ScheduledModules) { - PathRank[KV.first()].Count = 0; - } -} - -void ThinLtoModuleIndex::addToWorklist( - std::vector &List, - ArrayRef Calls) { - for (const auto &Edge : Calls) { - const auto &SummaryList = Edge.first.getSummaryList(); - if (!SummaryList.empty()) { - GlobalValueSummary *S = SummaryList.front().get()->getBaseObject(); - assert(isa(S) && "Callees must be functions"); - List.push_back(cast(S)); - } - } -} - -// PathRank is global and continuous. -void ThinLtoModuleIndex::updatePathRank(StringRef Path, unsigned Distance) { - auto &Entry = PathRank[Path]; - Entry.Count += 1; - Entry.MinDist = std::min(Entry.MinDist, Distance); - assert(Entry.MinDist > 0 && "We want it as a divisor"); -} - -// TODO: The size of a ThreadPool's task queue is not accessible. It would -// be great to know in order to estimate how many modules we schedule. The -// more we schedule, the less precise is the ranking. The less we schedule, -// the higher the risk for downtime. -std::vector ThinLtoModuleIndex::selectNextPaths() { - struct ScorePath { - float Score; - unsigned MinDist; - StringRef Path; - }; - - std::vector Candidates; - Candidates.reserve(PathRank.size()); - for (const auto &KV : PathRank) { - float Score = static_cast(KV.second.Count) / KV.second.MinDist; - if (Score > .0f) { - Candidates.push_back({Score, KV.second.MinDist, KV.first()}); - } - } - - // Sort candidates by descending score. - std::sort(Candidates.begin(), Candidates.end(), - [](const ScorePath &LHS, const ScorePath &RHS) { - return LHS.Score > RHS.Score; - }); - - // Sort highest score candidates by ascending minimal distance. 
- size_t Selected = - std::min(std::max(NumParseModuleThreads, Candidates.size() / 2), - Candidates.size()); - std::sort(Candidates.begin(), Candidates.begin() + Selected, - [](const ScorePath &LHS, const ScorePath &RHS) { - return LHS.MinDist < RHS.MinDist; - }); - - std::vector Paths; - Paths.reserve(Selected); - for (unsigned i = 0; i < Selected; i++) { - Paths.push_back(Candidates[i].Path.str()); - } - - LLVM_DEBUG(dbgs() << "ModuleIndex: select " << Paths.size() << " out of " - << Candidates.size() << " discovered paths\n"); - - return Paths; -} - -unsigned ThinLtoModuleIndex::getNumDiscoveredModules() const { - // TODO: It would probably be more efficient to track the number of - // unscheduled modules. - unsigned NonNullItems = 0; - for (const auto &KV : PathRank) - if (KV.second.Count > 0) - ++NonNullItems; - return NonNullItems; -} - -} // namespace orc -} // namespace llvm diff --git a/llvm/examples/ThinLtoJIT/bench b/llvm/examples/ThinLtoJIT/bench deleted file mode 100755 --- a/llvm/examples/ThinLtoJIT/bench +++ /dev/null @@ -1,100 +0,0 @@ -#!/bin/bash -#set -x - -if [ $# -gt 2 ]; then - TOOLS_DIR="$1" - SOURCE_DIR="$2" - MAIN_SOURCE_FILE="$3" -else - echo "Usage: bench
[]" - exit 1 -fi - -if [ $# -gt 3 ]; then - SYS_ROOT="$4" -else - SYS_ROOT="/" -fi - -function check_tool () -{ - if [ -e "${TOOLS_DIR}/$1" ]; then - echo "Found: $1" - else - echo "!!! Cannot find required tool, please provide it in the LLVM binaries folder: $1" - fi -} - -check_tool lli -check_tool SpeculativeJIT -check_tool ThinLtoJIT - -SKIP_BITCODE_GEN=0 -if [[ -e bc-default || -e bc-thinlto || -e ll-default || -e ll-thinlto ]]; then - echo "Skipping bitcode generation: output directories existing" - echo "Please clean up manually: rm -R bc-default bc-thinlto ll-default ll-thinlto" - SKIP_BITCODE_GEN=1 -else - check_tool clang - check_tool llvm-dis - check_tool llvm-lto - mkdir bc-default - mkdir bc-thinlto - mkdir ll-default - mkdir ll-thinlto -fi - -ROOT_DIR=$(pwd) -ALL_BITCODE_FILES="" - -MAIN_FILE_BASENAME=$(basename "${MAIN_SOURCE_FILE%.c*}") -LLI_EXTRA_MODULES="" - -for f in ${SOURCE_DIR}/*.c* ; do - BASE_NAME=$(basename "${f%.c*}") - - if [ ${SKIP_BITCODE_GEN} -eq 0 ]; then - echo "Compile: $f -> ${BASE_NAME}.bc" - - ${TOOLS_DIR}/clang -c -I ${SOURCE_DIR} ${CFLAGS} -isysroot ${SYS_ROOT} -emit-llvm \ - -o "bc-default/${BASE_NAME}.bc" "$f" - ${TOOLS_DIR}/clang -c -I ${SOURCE_DIR} ${CFLAGS} -isysroot ${SYS_ROOT} -flto=thin \ - -o "bc-thinlto/${BASE_NAME}.bc" "$f" - - echo "Disassemble ${BASE_NAME}.bc -> ${BASE_NAME}.ll" - ${TOOLS_DIR}/llvm-dis bc-default/${BASE_NAME}.bc -o ll-default/${BASE_NAME}.ll - ${TOOLS_DIR}/llvm-dis bc-thinlto/${BASE_NAME}.bc -o ll-thinlto/${BASE_NAME}.ll - fi - - ALL_BITCODE_FILES="${ALL_BITCODE_FILES} ${BASE_NAME}.bc" - if [ "${BASE_NAME}" != "${MAIN_FILE_BASENAME}" ]; then - LLI_EXTRA_MODULES="${LLI_EXTRA_MODULES} -extra-module=${BASE_NAME}.bc" - fi -done - -if [ ${SKIP_BITCODE_GEN} -eq 0 ]; then - echo "Link global index file: index.thinlto.bc" - cd ${ROOT_DIR}/bc-thinlto - ${TOOLS_DIR}/llvm-lto --thinlto -o ${ROOT_DIR}/bc-thinlto/index ${ALL_BITCODE_FILES} - - echo "Disassemble global index file: index.thinlto.ll" - cd 
${ROOT_DIR}/ll-thinlto - ${TOOLS_DIR}/llvm-dis -o index.thinlto.ll ${ROOT_DIR}/bc-thinlto/index.thinlto.bc -fi - -set -x -cd ${ROOT_DIR}/bc-default -time (${TOOLS_DIR}/clang -o ${MAIN_FILE_BASENAME} -O0 ${LDFLAGS} ${ALL_BITCODE_FILES} && ./${MAIN_FILE_BASENAME} ${EXEC_ARGS} 1>/dev/null) -time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=mcjit "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null -time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-mcjit "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null -time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null -time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -compile-threads=8 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null -time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null -time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=8 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null -time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=8 -O1 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null -time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=8 -O0 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null -time ${TOOLS_DIR}/SpeculativeJIT -num-threads=8 ${ALL_BITCODE_FILES} --args ${EXEC_ARGS} 1>/dev/null - -cd ${ROOT_DIR}/bc-thinlto -#time (${TOOLS_DIR}/clang -flto=thin -o test ${ALL_BITCODE_FILES} && ./test ${EXEC_ARGS} 1>/dev/null) -time ${TOOLS_DIR}/ThinLtoJIT index.thinlto.bc --args ${EXEC_ARGS} 1>/dev/null diff --git a/llvm/examples/ThinLtoJIT/main.cpp b/llvm/examples/ThinLtoJIT/main.cpp deleted file mode 100644 --- a/llvm/examples/ThinLtoJIT/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/InitLLVM.h" 
-#include "llvm/Support/TargetSelect.h" - -#include "ThinLtoJIT.h" - -#include -#include - -using namespace llvm; - -static cl::list - InputFiles(cl::Positional, cl::OneOrMore, - cl::desc("")); - -static cl::list InputArgs("args", cl::Positional, - cl::desc("..."), - cl::ZeroOrMore, cl::PositionalEatsArgs); - -static cl::opt CompileThreads("compile-threads", cl::Optional, - cl::desc("Number of compile threads"), - cl::init(4)); - -static cl::opt LoadThreads("load-threads", cl::Optional, - cl::desc("Number of module load threads"), - cl::init(8)); - -static cl::opt - LookaheadLevels("lookahead", cl::Optional, - cl::desc("Calls to look ahead of execution"), cl::init(4)); - -static cl::opt DiscoveryFlagsBucketSize( - "discovery-flag-bucket-size", cl::Optional, - cl::desc("Flags per bucket (rounds up to memory pages)"), cl::init(4096)); - -static cl::opt - MemFence("mem-fence", - cl::desc("Control memory fences for cache synchronization"), - cl::init(orc::ThinLtoJIT::NeverFence), - cl::values(clEnumValN(orc::ThinLtoJIT::NeverFence, "never", - "No use of memory fences"), - clEnumValN(orc::ThinLtoJIT::FenceStaticCode, "static", - "Use of memory fences in static code only"), - clEnumValN(orc::ThinLtoJIT::FenceJITedCode, "jited", - "Install memory fences in JITed code only"), - clEnumValN(orc::ThinLtoJIT::AlwaysFence, "always", - "Always use of memory fences"))); - -static cl::opt - AllowNudge("allow-nudge", - cl::desc("Allow the symbol generator to nudge symbols into " - "discovery even though they haven't been reached"), - cl::init(false)); - -static cl::opt PrintStats("print-stats", - cl::desc("Print module stats on shutdown"), - cl::init(false)); - -int main(int argc, char *argv[]) { - InitLLVM X(argc, argv); - InitializeNativeTarget(); - InitializeNativeTargetAsmPrinter(); - cl::ParseCommandLineOptions(argc, argv, "ThinLtoJIT"); - - Error Err = Error::success(); - auto atLeastOne = [](unsigned N) { return std::max(1u, N); }; - - orc::ThinLtoJIT Jit(InputFiles, "main", 
atLeastOne(LookaheadLevels), - atLeastOne(CompileThreads), atLeastOne(LoadThreads), - DiscoveryFlagsBucketSize, MemFence, AllowNudge, - PrintStats, Err); - if (Err) { - logAllUnhandledErrors(std::move(Err), errs(), "[ThinLtoJIT] "); - exit(1); - } - - ExitOnError ExitOnErr; - ExitOnErr.setBanner("[ThinLtoJIT] "); - - return ExitOnErr(Jit.main(InputArgs)); -} diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -916,6 +916,11 @@ bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS); + /// Test if the given expression is known to satisfy the condition described + /// by Pred, LHS, and RHS in the given Context. + bool isKnownPredicateAt(ICmpInst::Predicate Pred, const SCEV *LHS, + const SCEV *RHS, const Instruction *Context); + /// Test if the condition described by Pred, LHS, RHS is known to be true on /// every iteration of the loop of the recurrency LHS. bool isKnownOnEveryIteration(ICmpInst::Predicate Pred, @@ -1672,23 +1677,30 @@ getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB) const; /// Test whether the condition described by Pred, LHS, and RHS is true - /// whenever the given FoundCondValue value evaluates to true. + /// whenever the given FoundCondValue value evaluates to true in given + /// Context. If Context is nullptr, then the found predicate is true + /// everywhere. bool isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, - const Value *FoundCondValue, bool Inverse); + const Value *FoundCondValue, bool Inverse, + const Instruction *Context = nullptr); /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by FoundPred, FoundLHS, FoundRHS is - /// true. + /// true in given Context. If Context is nullptr, then the found predicate is + /// true everywhere. 
bool isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, ICmpInst::Predicate FoundPred, const SCEV *FoundLHS, - const SCEV *FoundRHS); + const SCEV *FoundRHS, + const Instruction *Context = nullptr); /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by Pred, FoundLHS, and FoundRHS is - /// true. + /// true in given Context. If Context is nullptr, then the found predicate is + /// true everywhere. bool isImpliedCondOperands(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, - const SCEV *FoundRHS); + const SCEV *FoundRHS, + const Instruction *Context = nullptr); /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by Pred, FoundLHS, and FoundRHS is @@ -1735,6 +1747,18 @@ const SCEV *FoundLHS, const SCEV *FoundRHS); + /// Test whether the condition described by Pred, LHS, and RHS is true + /// whenever the condition described by Pred, FoundLHS, and FoundRHS is + /// true. + /// + /// This routine tries to weaken the known condition basing on fact that + /// FoundLHS is an AddRec. + bool isImpliedCondOperandsViaAddRecStart(ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS, + const SCEV *FoundLHS, + const SCEV *FoundRHS, + const Instruction *Context); + /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by Pred, FoundLHS, and FoundRHS is /// true. diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -856,10 +856,11 @@ SHT_LLVM_ADDRSIG = 0x6fff4c03, // List of address-significant symbols // for safe ICF. SHT_LLVM_DEPENDENT_LIBRARIES = - 0x6fff4c04, // LLVM Dependent Library Specifiers. - SHT_LLVM_SYMPART = 0x6fff4c05, // Symbol partition specification. 
- SHT_LLVM_PART_EHDR = 0x6fff4c06, // ELF header for loadable partition. - SHT_LLVM_PART_PHDR = 0x6fff4c07, // Phdrs for loadable partition. + 0x6fff4c04, // LLVM Dependent Library Specifiers. + SHT_LLVM_SYMPART = 0x6fff4c05, // Symbol partition specification. + SHT_LLVM_PART_EHDR = 0x6fff4c06, // ELF header for loadable partition. + SHT_LLVM_PART_PHDR = 0x6fff4c07, // Phdrs for loadable partition. + SHT_LLVM_BB_ADDR_MAP = 0x6fff4c08, // LLVM Basic Block Address Map. // Android's experimental support for SHT_RELR sections. // https://android.googlesource.com/platform/bionic/+/b7feec74547f84559a1467aca02708ff61346d2a/libc/include/elf.h#512 SHT_ANDROID_RELR = 0x6fffff00, // Relocation entries; only offsets. diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -141,7 +141,11 @@ private: MCSymbol *CurrentFnEnd = nullptr; - MCSymbol *CurExceptionSym = nullptr; + + /// Map a basic block section ID to the exception symbol associated with that + /// section. Map entries are assigned and looked up via + /// AsmPrinter::getMBBExceptionSym. + DenseMap MBBSectionExceptionSyms; // The symbol used to represent the start of the current BB section of the // function. This is used to calculate the size of the BB section. @@ -238,7 +242,10 @@ MCSymbol *getFunctionBegin() const { return CurrentFnBegin; } MCSymbol *getFunctionEnd() const { return CurrentFnEnd; } - MCSymbol *getCurExceptionSym(); + + // Return the exception symbol associated with the MBB section containing a + // given basic block. + MCSymbol *getMBBExceptionSym(const MachineBasicBlock &MBB); /// Return information about object file lowering. 
const TargetLoweringObjectFile &getObjFileLowering() const; diff --git a/llvm/include/llvm/CodeGen/AsmPrinterHandler.h b/llvm/include/llvm/CodeGen/AsmPrinterHandler.h --- a/llvm/include/llvm/CodeGen/AsmPrinterHandler.h +++ b/llvm/include/llvm/CodeGen/AsmPrinterHandler.h @@ -24,7 +24,8 @@ class MachineInstr; class MCSymbol; -typedef MCSymbol *ExceptionSymbolProvider(AsmPrinter *Asm); +typedef MCSymbol *ExceptionSymbolProvider(AsmPrinter *Asm, + const MachineBasicBlock *MBB); /// Collects and handles AsmPrinter objects required to build debug /// or EH information. diff --git a/llvm/include/llvm/CodeGen/Register.h b/llvm/include/llvm/CodeGen/Register.h --- a/llvm/include/llvm/CodeGen/Register.h +++ b/llvm/include/llvm/CodeGen/Register.h @@ -40,10 +40,6 @@ /// frame index in a variable that normally holds a register. isStackSlot() /// returns true if Reg is in the range used for stack slots. /// - /// Note that isVirtualRegister() and isPhysicalRegister() cannot handle stack - /// slots, so if a variable may contains a stack slot, always check - /// isStackSlot() first. - /// static bool isStackSlot(unsigned Reg) { return MCRegister::isStackSlot(Reg); } @@ -69,8 +65,7 @@ /// Return true if the specified register number is in /// the virtual register namespace. static bool isVirtualRegister(unsigned Reg) { - assert(!isStackSlot(Reg) && "Not a register! Check isStackSlot() first."); - return Reg & MCRegister::VirtualRegFlag; + return Reg & MCRegister::VirtualRegFlag && !isStackSlot(Reg); } /// Convert a virtual register number to a 0-based index. 
diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h b/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h --- a/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h +++ b/llvm/include/llvm/DebugInfo/CodeView/TypeHashing.h @@ -86,6 +86,16 @@ bool empty() const { return *(const uint64_t*)Hash.data() == 0; } + friend inline bool operator==(const GloballyHashedType &L, + const GloballyHashedType &R) { + return L.Hash == R.Hash; + } + + friend inline bool operator!=(const GloballyHashedType &L, + const GloballyHashedType &R) { + return !(L.Hash == R.Hash); + } + /// Given a sequence of bytes representing a record, compute a global hash for /// this record. Due to the nature of global hashes incorporating the hashes /// of referenced records, this function requires a list of types and ids @@ -206,7 +216,7 @@ static bool isEqual(codeview::GloballyHashedType LHS, codeview::GloballyHashedType RHS) { - return LHS.Hash == RHS.Hash; + return LHS == RHS; } }; diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h b/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h --- a/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h +++ b/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h @@ -116,13 +116,22 @@ uint32_t toArrayIndex() const { assert(!isSimple()); - return getIndex() - FirstNonSimpleIndex; + return (getIndex() & ~DecoratedItemIdMask) - FirstNonSimpleIndex; } static TypeIndex fromArrayIndex(uint32_t Index) { return TypeIndex(Index + FirstNonSimpleIndex); } + static TypeIndex fromDecoratedArrayIndex(bool IsItem, uint32_t Index) { + return TypeIndex((Index + FirstNonSimpleIndex) | + (IsItem ? 
DecoratedItemIdMask : 0)); + } + + TypeIndex removeDecoration() { + return TypeIndex(Index & ~DecoratedItemIdMask); + } + SimpleTypeKind getSimpleKind() const { assert(isSimple()); return static_cast(Index & SimpleKindMask); diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h --- a/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h @@ -54,16 +54,20 @@ void setVersionHeader(PdbRaw_TpiVer Version); void addTypeRecord(ArrayRef Type, Optional Hash); + void addTypeRecords(ArrayRef Types, ArrayRef Sizes, + ArrayRef Hashes); Error finalizeMsfLayout(); - uint32_t getRecordCount() const { return TypeRecords.size(); } + uint32_t getRecordCount() const { return TypeRecordCount; } Error commit(const msf::MSFLayout &Layout, WritableBinaryStreamRef Buffer); uint32_t calculateSerializedLength(); private: + void updateTypeIndexOffsets(ArrayRef Sizes); + uint32_t calculateHashBufferSize() const; uint32_t calculateIndexOffsetSize() const; Error finalize(); @@ -71,10 +75,11 @@ msf::MSFBuilder &Msf; BumpPtrAllocator &Allocator; + uint32_t TypeRecordCount = 0; size_t TypeRecordBytes = 0; PdbRaw_TpiVer VerHeader = PdbRaw_TpiVer::PdbTpiV80; - std::vector> TypeRecords; + std::vector> TypeRecBuffers; std::vector TypeHashes; std::vector TypeIndexOffsets; uint32_t HashStreamIndex = kInvalidStreamIndex; diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -2015,6 +2015,18 @@ return m_Intrinsic(Op0, Op1); } +template +inline typename m_Intrinsic_Ty::Ty +m_FShl(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2) { + return m_Intrinsic(Op0, Op1, Op2); +} + +template +inline typename m_Intrinsic_Ty::Ty +m_FShr(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2) { + return m_Intrinsic(Op0, Op1, Op2); +} + 
//===----------------------------------------------------------------------===// // Matchers for two-operands operators with the operators in either order // diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -242,6 +242,7 @@ void initializeLoopInfoWrapperPassPass(PassRegistry&); void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&); void initializeLoopInterchangePass(PassRegistry&); +void initializeLoopFlattenLegacyPassPass(PassRegistry&); void initializeLoopLoadEliminationPass(PassRegistry&); void initializeLoopPassPass(PassRegistry&); void initializeLoopPredicationLegacyPassPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -127,6 +127,7 @@ (void) llvm::createLazyValueInfoPass(); (void) llvm::createLoopExtractorPass(); (void) llvm::createLoopInterchangePass(); + (void) llvm::createLoopFlattenPass(); (void) llvm::createLoopPredicationPass(); (void) llvm::createLoopSimplifyPass(); (void) llvm::createLoopSimplifyCFGPass(); diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -355,30 +355,20 @@ /// The symbol being referenced. const MCSymbol *Symbol; - // Subclass data stores VariantKind in bits 0..15, UseParensForSymbolVariant - // in bit 16 and HasSubsectionsViaSymbols in bit 17. + // Subclass data stores VariantKind in bits 0..15 and HasSubsectionsViaSymbols + // in bit 16. static const unsigned VariantKindBits = 16; static const unsigned VariantKindMask = (1 << VariantKindBits) - 1; - /// Specifies how the variant kind should be printed. - static const unsigned UseParensForSymbolVariantBit = 1 << VariantKindBits; - // FIXME: Remove this bit. 
- static const unsigned HasSubsectionsViaSymbolsBit = - 1 << (VariantKindBits + 1); + static const unsigned HasSubsectionsViaSymbolsBit = 1 << VariantKindBits; static unsigned encodeSubclassData(VariantKind Kind, - bool UseParensForSymbolVariant, - bool HasSubsectionsViaSymbols) { + bool HasSubsectionsViaSymbols) { return (unsigned)Kind | - (UseParensForSymbolVariant ? UseParensForSymbolVariantBit : 0) | (HasSubsectionsViaSymbols ? HasSubsectionsViaSymbolsBit : 0); } - bool useParensForSymbolVariant() const { - return (getSubclassData() & UseParensForSymbolVariantBit) != 0; - } - explicit MCSymbolRefExpr(const MCSymbol *Symbol, VariantKind Kind, const MCAsmInfo *MAI, SMLoc Loc = SMLoc()); @@ -405,8 +395,6 @@ return (VariantKind)(getSubclassData() & VariantKindMask); } - void printVariantKind(raw_ostream &OS) const; - bool hasSubsectionsViaSymbols() const { return (getSubclassData() & HasSubsectionsViaSymbolsBit) != 0; } diff --git a/llvm/include/llvm/MC/MCRegister.h b/llvm/include/llvm/MC/MCRegister.h --- a/llvm/include/llvm/MC/MCRegister.h +++ b/llvm/include/llvm/MC/MCRegister.h @@ -46,9 +46,6 @@ /// register. StackSlot values do not exist in the MC layer, see /// Register::isStackSlot() for the more information on them. /// - /// Note that isVirtualRegister() and isPhysicalRegister() cannot handle stack - /// slots, so if a variable may contains a stack slot, always check - /// isStackSlot() first. static bool isStackSlot(unsigned Reg) { return !(Reg & VirtualRegFlag) && uint32_t(Reg & ~VirtualRegFlag) >= FirstStackSlot; @@ -57,8 +54,8 @@ /// Return true if the specified register number is in /// the physical register namespace. static bool isPhysicalRegister(unsigned Reg) { - assert(!isStackSlot(Reg) && "Not a register! 
Check isStackSlot() first."); - return Reg >= FirstPhysicalReg && !(Reg & VirtualRegFlag); + return Reg >= FirstPhysicalReg && !(Reg & VirtualRegFlag) && + !isStackSlot(Reg); } /// Return true if the specified register number is in the physical register diff --git a/llvm/include/llvm/Object/SymbolicFile.h b/llvm/include/llvm/Object/SymbolicFile.h --- a/llvm/include/llvm/Object/SymbolicFile.h +++ b/llvm/include/llvm/Object/SymbolicFile.h @@ -173,6 +173,8 @@ static bool classof(const Binary *v) { return v->isSymbolic(); } + + static bool isSymbolicFile(file_magic Type, const LLVMContext *Context); }; inline BasicSymbolRef::BasicSymbolRef(DataRefImpl SymbolP, diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h --- a/llvm/include/llvm/Passes/StandardInstrumentations.h +++ b/llvm/include/llvm/Passes/StandardInstrumentations.h @@ -122,6 +122,97 @@ void registerCallbacks(PassInstrumentationCallbacks &PIC); }; +// Base class for classes that report changes to the IR. +// It presents an interface for such classes and provides calls +// on various events as the new pass manager transforms the IR. +// It also provides filtering of information based on hidden options +// specifying which functions are interesting. +// Calls are made for the following events/queries: +// 1. The initial IR processed. +// 2. To get the representation of the IR (of type \p T). +// 3. When a pass does not change the IR. +// 4. When a pass changes the IR (given both before and after representations +// of type \p T). +// 5. When an IR is invalidated. +// 6. When a pass is run on an IR that is not interesting (based on options). +// 7. When a pass is ignored (pass manager or adapter pass). +// 8. To compare two IR representations (of type \p T). 
+template class ChangePrinter {
+protected:
+  ChangePrinter() {}
+
+public:
+  virtual ~ChangePrinter();
+
+  // Determine if this pass/IR is interesting and if so, save the IR
+  // otherwise it is left on the stack without data.
+  void saveIRBeforePass(Any IR, StringRef PassID);
+  // Compare the IR from before the pass after the pass.
+  void handleIRAfterPass(Any IR, StringRef PassID);
+  // Handle the situation where a pass is invalidated.
+  void handleInvalidatedPass(StringRef PassID);
+
+protected:
+  // Called on the first IR processed.
+  virtual void handleInitialIR(Any IR) = 0;
+  // Called before and after a pass to get the representation of the IR.
+  virtual void generateIRRepresentation(Any IR, StringRef PassID,
+                                        IRUnitT &Output) = 0;
+  // Called when the pass is not interesting.
+  virtual void omitAfter(StringRef PassID, std::string &Name) = 0;
+  // Called when an interesting IR has changed.
+  virtual void handleAfter(StringRef PassID, std::string &Name,
+                           const IRUnitT &Before, const IRUnitT &After,
+                           Any) = 0;
+  // Called when an interesting pass is invalidated.
+  virtual void handleInvalidated(StringRef PassID) = 0;
+  // Called when the IR or pass is not interesting.
+  virtual void handleFiltered(StringRef PassID, std::string &Name) = 0;
+  // Called when an ignored pass is encountered.
+  virtual void handleIgnored(StringRef PassID, std::string &Name) = 0;
+  // Called to compare the before and after representations of the IR.
+  virtual bool same(const IRUnitT &Before, const IRUnitT &After) = 0;
+
+  // Stack of IRs before passes.
+  std::vector BeforeStack;
+  // Is this the first IR seen?
+  bool InitialIR = true;
+};
+
+// A change printer based on the string representation of the IR as created
+// by unwrapAndPrint. The string representation is stored in a std::string
+// to preserve it as the IR changes in each pass. Note that the banner is
+// included in this representation but it is massaged before reporting. 
+class IRChangePrinter : public ChangePrinter {
+public:
+  IRChangePrinter();
+  ~IRChangePrinter() override;
+  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+
+protected:
+  // Called on the first IR processed.
+  void handleInitialIR(Any IR) override;
+  // Called before and after a pass to get the representation of the IR.
+  void generateIRRepresentation(Any IR, StringRef PassID,
+                                std::string &Output) override;
+  // Called when the pass is not interesting.
+  void omitAfter(StringRef PassID, std::string &Name) override;
+  // Called when an interesting IR has changed.
+  void handleAfter(StringRef PassID, std::string &Name,
+                   const std::string &Before, const std::string &After,
+                   Any) override;
+  // Called when an interesting pass is invalidated.
+  void handleInvalidated(StringRef PassID) override;
+  // Called when the IR or pass is not interesting.
+  void handleFiltered(StringRef PassID, std::string &Name) override;
+  // Called when an ignored pass is encountered.
+  void handleIgnored(StringRef PassID, std::string &Name) override;
+  // Called to compare the before and after representations of the IR.
+  bool same(const std::string &Before, const std::string &After) override;
+
+  raw_ostream &Out;
+};
+
+/// This class provides an interface to register all the standard pass
+/// instrumentations and manages their state (if any). 
class StandardInstrumentations { @@ -130,6 +221,7 @@ TimePassesHandler TimePasses; OptNoneInstrumentation OptNone; PreservedCFGCheckerInstrumentation PreservedCFGChecker; + IRChangePrinter PrintChangedIR; public: StandardInstrumentations(bool DebugLogging) : PrintPass(DebugLogging) {} diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -34,6 +34,7 @@ class MCSubtargetInfo; class MCSymbol; class raw_pwrite_stream; +class PassBuilder; class PassManagerBuilder; struct PerFunctionMIParsingState; class SMDiagnostic; @@ -294,6 +295,11 @@ /// PassManagerBuilder::addExtension. virtual void adjustPassManager(PassManagerBuilder &) {} + /// Allow the target to modify the pass pipeline with New Pass Manager + /// (similar to adjustPassManager for Legacy Pass manager). + virtual void registerPassBuilderCallbacks(PassBuilder &, + bool DebugPassManager) {} + /// Add passes to the specified pass manager to get the specified file /// emitted. Typically this will involve several steps of code generation. 
/// This method should return true if emission of this file type is not diff --git a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h --- a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h +++ b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h @@ -223,6 +223,9 @@ struct WholeProgramDevirtPass : public PassInfoMixin { ModuleSummaryIndex *ExportSummary; const ModuleSummaryIndex *ImportSummary; + bool UseCommandLine = false; + WholeProgramDevirtPass() + : ExportSummary(nullptr), ImportSummary(nullptr), UseCommandLine(true) {} WholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary) : ExportSummary(ExportSummary), ImportSummary(ImportSummary) { diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -149,6 +149,12 @@ // Pass *createLoopInterchangePass(); +//===----------------------------------------------------------------------===// +// +// LoopFlatten - This pass flattens nested loops into a single loop. +// +Pass *createLoopFlattenPass(); + //===----------------------------------------------------------------------===// // // LoopStrengthReduce - This pass is strength reduces GEP instructions that use diff --git a/llvm/include/llvm/Transforms/Scalar/LoopFlatten.h b/llvm/include/llvm/Transforms/Scalar/LoopFlatten.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/LoopFlatten.h @@ -0,0 +1,33 @@ +//===- LoopFlatten.h - Loop Flatten ---------------- -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides the interface for the Loop Flatten Pass. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H +#define LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H + +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +class LoopFlattenPass : public PassInfoMixin { +public: + LoopFlattenPass() = default; + + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -3963,10 +3963,8 @@ // Test for a bogus zero-shift-guard-op around funnel-shift or rotate. Value *ShAmt; - auto isFsh = m_CombineOr(m_Intrinsic(m_Value(X), m_Value(), - m_Value(ShAmt)), - m_Intrinsic(m_Value(), m_Value(X), - m_Value(ShAmt))); + auto isFsh = m_CombineOr(m_FShl(m_Value(X), m_Value(), m_Value(ShAmt)), + m_FShr(m_Value(), m_Value(X), m_Value(ShAmt))); // (ShAmt == 0) ? fshl(X, *, ShAmt) : X --> X // (ShAmt == 0) ? fshr(*, X, ShAmt) : X --> X if (match(TrueVal, isFsh) && FalseVal == X && CmpLHS == ShAmt) @@ -3977,12 +3975,9 @@ // intrinsics do not have that problem. // We do not allow this transform for the general funnel shift case because // that would not preserve the poison safety of the original code. 
- auto isRotate = m_CombineOr(m_Intrinsic(m_Value(X), - m_Deferred(X), - m_Value(ShAmt)), - m_Intrinsic(m_Value(X), - m_Deferred(X), - m_Value(ShAmt))); + auto isRotate = + m_CombineOr(m_FShl(m_Value(X), m_Deferred(X), m_Value(ShAmt)), + m_FShr(m_Value(X), m_Deferred(X), m_Value(ShAmt))); // (ShAmt == 0) ? X : fshl(X, X, ShAmt) --> fshl(X, X, ShAmt) // (ShAmt == 0) ? X : fshr(X, X, ShAmt) --> fshr(X, X, ShAmt) if (match(FalseVal, isRotate) && TrueVal == X && CmpLHS == ShAmt && diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -9100,6 +9100,14 @@ return isKnownViaNonRecursiveReasoning(Pred, LHS, RHS); } +bool ScalarEvolution::isKnownPredicateAt(ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS, + const Instruction *Context) { + // TODO: Analyze guards and assumes from Context's block. + return isKnownPredicate(Pred, LHS, RHS) || + isBasicBlockEntryGuardedByCond(Context->getParent(), Pred, LHS, RHS); +} + bool ScalarEvolution::isKnownOnEveryIteration(ICmpInst::Predicate Pred, const SCEVAddRecExpr *LHS, const SCEV *RHS) { @@ -9541,15 +9549,16 @@ // Try to prove (Pred, LHS, RHS) using isImpliedCond. 
auto ProveViaCond = [&](const Value *Condition, bool Inverse) { - if (isImpliedCond(Pred, LHS, RHS, Condition, Inverse)) + const Instruction *Context = &BB->front(); + if (isImpliedCond(Pred, LHS, RHS, Condition, Inverse, Context)) return true; if (ProvingStrictComparison) { if (!ProvedNonStrictComparison) - ProvedNonStrictComparison = - isImpliedCond(NonStrictPredicate, LHS, RHS, Condition, Inverse); + ProvedNonStrictComparison = isImpliedCond(NonStrictPredicate, LHS, RHS, + Condition, Inverse, Context); if (!ProvedNonEquality) - ProvedNonEquality = - isImpliedCond(ICmpInst::ICMP_NE, LHS, RHS, Condition, Inverse); + ProvedNonEquality = isImpliedCond(ICmpInst::ICMP_NE, LHS, RHS, + Condition, Inverse, Context); if (ProvedNonStrictComparison && ProvedNonEquality) return true; } @@ -9615,7 +9624,8 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, - const Value *FoundCondValue, bool Inverse) { + const Value *FoundCondValue, bool Inverse, + const Instruction *Context) { if (!PendingLoopPredicates.insert(FoundCondValue).second) return false; @@ -9626,12 +9636,16 @@ if (const BinaryOperator *BO = dyn_cast(FoundCondValue)) { if (BO->getOpcode() == Instruction::And) { if (!Inverse) - return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) || - isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse); + return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse, + Context) || + isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse, + Context); } else if (BO->getOpcode() == Instruction::Or) { if (Inverse) - return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) || - isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse); + return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse, + Context) || + isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse, + Context); } } @@ -9649,14 +9663,14 @@ const SCEV *FoundLHS = getSCEV(ICI->getOperand(0)); const SCEV *FoundRHS = 
getSCEV(ICI->getOperand(1)); - return isImpliedCond(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS); + return isImpliedCond(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS, Context); } bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, ICmpInst::Predicate FoundPred, - const SCEV *FoundLHS, - const SCEV *FoundRHS) { + const SCEV *FoundLHS, const SCEV *FoundRHS, + const Instruction *Context) { // Balance the types. if (getTypeSizeInBits(LHS->getType()) < getTypeSizeInBits(FoundLHS->getType())) { @@ -9700,16 +9714,16 @@ // Check whether the found predicate is the same as the desired predicate. if (FoundPred == Pred) - return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS); + return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, Context); // Check whether swapping the found predicate makes it the same as the // desired predicate. if (ICmpInst::getSwappedPredicate(FoundPred) == Pred) { if (isa(RHS)) - return isImpliedCondOperands(Pred, LHS, RHS, FoundRHS, FoundLHS); + return isImpliedCondOperands(Pred, LHS, RHS, FoundRHS, FoundLHS, Context); else - return isImpliedCondOperands(ICmpInst::getSwappedPredicate(Pred), - RHS, LHS, FoundLHS, FoundRHS); + return isImpliedCondOperands(ICmpInst::getSwappedPredicate(Pred), RHS, + LHS, FoundLHS, FoundRHS, Context); } // Unsigned comparison is the same as signed comparison when both the operands @@ -9717,7 +9731,7 @@ if (CmpInst::isUnsigned(FoundPred) && CmpInst::getSignedPredicate(FoundPred) == Pred && isKnownNonNegative(FoundLHS) && isKnownNonNegative(FoundRHS)) - return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS); + return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, Context); // Check if we can make progress by sharpening ranges. if (FoundPred == ICmpInst::ICMP_NE && @@ -9754,8 +9768,8 @@ case ICmpInst::ICMP_UGE: // We know V `Pred` SharperMin. If this implies LHS `Pred` // RHS, we're done. 
- if (isImpliedCondOperands(Pred, LHS, RHS, V, - getConstant(SharperMin))) + if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(SharperMin), + Context)) return true; LLVM_FALLTHROUGH; @@ -9770,7 +9784,8 @@ // // If V `Pred` Min implies LHS `Pred` RHS, we're done. - if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min))) + if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min), + Context)) return true; break; @@ -9778,14 +9793,14 @@ case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_ULE: if (isImpliedCondOperands(CmpInst::getSwappedPredicate(Pred), RHS, - LHS, V, getConstant(SharperMin))) + LHS, V, getConstant(SharperMin), Context)) return true; LLVM_FALLTHROUGH; case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_ULT: if (isImpliedCondOperands(CmpInst::getSwappedPredicate(Pred), RHS, - LHS, V, getConstant(Min))) + LHS, V, getConstant(Min), Context)) return true; break; @@ -9799,11 +9814,12 @@ // Check whether the actual condition is beyond sufficient. if (FoundPred == ICmpInst::ICMP_EQ) if (ICmpInst::isTrueWhenEqual(Pred)) - if (isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS)) + if (isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, Context)) return true; if (Pred == ICmpInst::ICMP_NE) if (!ICmpInst::isTrueWhenEqual(FoundPred)) - if (isImpliedCondOperands(FoundPred, LHS, RHS, FoundLHS, FoundRHS)) + if (isImpliedCondOperands(FoundPred, LHS, RHS, FoundLHS, FoundRHS, + Context)) return true; // Otherwise assume the worst. @@ -9882,6 +9898,44 @@ return None; } +bool ScalarEvolution::isImpliedCondOperandsViaAddRecStart( + ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, + const SCEV *FoundLHS, const SCEV *FoundRHS, const Instruction *Context) { + // Try to recognize the following pattern: + // + // FoundRHS = ... + // ... 
+ // loop: + // FoundLHS = {Start,+,W} + // context_bb: // Basic block from the same loop + // known(Pred, FoundLHS, FoundRHS) + // + // If some predicate is known in the context of a loop, it is also known on + // each iteration of this loop, including the first iteration. Therefore, in + // this case, `FoundLHS Pred FoundRHS` implies `Start Pred FoundRHS`. Try to + // prove the original pred using this fact. + if (!Context) + return false; + // Make sure AR varies in the context block. + if (auto *AR = dyn_cast(FoundLHS)) { + if (!AR->getLoop()->contains(Context->getParent())) + return false; + if (!isAvailableAtLoopEntry(FoundRHS, AR->getLoop())) + return false; + return isImpliedCondOperands(Pred, LHS, RHS, AR->getStart(), FoundRHS); + } + + if (auto *AR = dyn_cast(FoundRHS)) { + if (!AR->getLoop()->contains(Context)) + return false; + if (!isAvailableAtLoopEntry(FoundLHS, AR->getLoop())) + return false; + return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, AR->getStart()); + } + + return false; +} + bool ScalarEvolution::isImpliedCondOperandsViaNoOverflow( ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS) { @@ -10072,13 +10126,18 @@ bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, - const SCEV *FoundRHS) { + const SCEV *FoundRHS, + const Instruction *Context) { if (isImpliedCondOperandsViaRanges(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; if (isImpliedCondOperandsViaNoOverflow(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; + if (isImpliedCondOperandsViaAddRecStart(Pred, LHS, RHS, FoundLHS, FoundRHS, + Context)) + return true; + return isImpliedCondOperandsHelper(Pred, LHS, RHS, FoundLHS, FoundRHS) || // ~x < ~y --> x > y diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -5345,6 +5345,8 @@ // The lexer has no type 
info, so builds all half, bfloat, float, and double // FP constants as double. Fix this here. Long double does not need this. if (&ID.APFloatVal.getSemantics() == &APFloat::IEEEdouble()) { + // Check for signaling before potentially converting and losing that info. + bool IsSNAN = ID.APFloatVal.isSignaling(); bool Ignored; if (Ty->isHalfTy()) ID.APFloatVal.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, @@ -5355,6 +5357,14 @@ else if (Ty->isFloatTy()) ID.APFloatVal.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Ignored); + if (IsSNAN) { + // The convert call above may quiet an SNaN, so manufacture another + // SNaN. The bitcast works because the payload (significand) parameter + // is truncated to fit. + APInt Payload = ID.APFloatVal.bitcastToAPInt(); + ID.APFloatVal = APFloat::getSNaN(ID.APFloatVal.getSemantics(), + ID.APFloatVal.isNegative(), &Payload); + } } V = ConstantFP::get(Context, ID.APFloatVal); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1023,9 +1023,10 @@ MCConstantExpr::create(FrameOffset, OutContext)); } -/// Returns the BB metadata to be emitted in the bb_addr_map section for a given -/// basic block. This can be used to capture more precise profile information. -/// We use the last 3 bits (LSBs) to ecnode the following information: +/// Returns the BB metadata to be emitted in the __llvm_bb_addr_map section for +/// a given basic block. This can be used to capture more precise profile +/// information. We use the last 3 bits (LSBs) to encode the following +/// information: /// * (1): set if return block (ret or tail call). /// * (2): set if ends with a tail call. /// * (3): set if exception handling (EH) landing pad. 
@@ -1040,7 +1041,7 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { MCSection *BBAddrMapSection = getObjFileLowering().getBBAddrMapSection(*MF.getSection()); - assert(BBAddrMapSection && ".bb_addr_map section is not initialized."); + assert(BBAddrMapSection && "__llvm_bb_addr_map section is not initialized."); const MCSymbol *FunctionSymbol = getFunctionBegin(); @@ -1763,10 +1764,11 @@ return false; } -MCSymbol *AsmPrinter::getCurExceptionSym() { - if (!CurExceptionSym) - CurExceptionSym = createTempSymbol("exception"); - return CurExceptionSym; +MCSymbol *AsmPrinter::getMBBExceptionSym(const MachineBasicBlock &MBB) { + auto Res = MBBSectionExceptionSyms.try_emplace(MBB.getSectionIDNum()); + if (Res.second) + Res.first->second = createTempSymbol("exception"); + return Res.first->second; } void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { @@ -1793,7 +1795,7 @@ CurrentFnBegin = nullptr; CurrentSectionBeginSym = nullptr; MBBSectionRanges.clear(); - CurExceptionSym = nullptr; + MBBSectionExceptionSyms.clear(); bool NeedsLocalForSize = MAI->needsLocalForSize(); if (F.hasFnAttribute("patchable-function-entry") || F.hasFnAttribute("function-instrument") || diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -81,8 +81,9 @@ } } -static MCSymbol *getExceptionSym(AsmPrinter *Asm) { - return Asm->getCurExceptionSym(); +static MCSymbol *getExceptionSym(AsmPrinter *Asm, + const MachineBasicBlock *MBB) { + return Asm->getMBBExceptionSym(*MBB); } void DwarfCFIException::beginFunction(const MachineFunction *MF) { @@ -161,7 +162,7 @@ // Provide LSDA information. 
 if (shouldEmitLSDA) - Asm->OutStreamer->emitCFILsda(ESP(Asm), TLOF.getLSDAEncoding()); + Asm->OutStreamer->emitCFILsda(ESP(Asm, MBB), TLOF.getLSDAEncoding()); } /// endFunction - Gather and emit post-function exception information. diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h --- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h +++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h @@ -69,23 +69,48 @@ unsigned Action; }; + /// Structure describing a contiguous range of call-sites which reside + /// in the same procedure fragment. With -fbasic-block-sections, there will + /// be one call site range per basic block section. Otherwise, we will have + /// one call site range containing all the call sites in the function. + struct CallSiteRange { + // Symbol marking the beginning of the procedure fragment. + MCSymbol *FragmentBeginLabel = nullptr; + // Symbol marking the end of the procedure fragment. + MCSymbol *FragmentEndLabel = nullptr; + // LSDA symbol for this call-site range. + MCSymbol *ExceptionLabel = nullptr; + // Index of the first call-site entry in the call-site table which + // belongs to this range. + size_t CallSiteBeginIdx = 0; + // Index just after the last call-site entry in the call-site table which + // belongs to this range. + size_t CallSiteEndIdx = 0; + // Whether this is the call-site range containing all the landing pads. + bool IsLPRange = false; + }; + /// Compute the actions table and gather the first action index for each /// landing pad site. - void computeActionsTable(const SmallVectorImpl &LandingPads, - SmallVectorImpl &Actions, - SmallVectorImpl &FirstActions); + void computeActionsTable( + const SmallVectorImpl &LandingPads, + SmallVectorImpl &Actions, + SmallVectorImpl &FirstActions); void computePadMap(const SmallVectorImpl &LandingPads, RangeMapType &PadMap); - /// Compute the call-site table. 
The entry for an invoke has a try-range - /// containing the call, a non-zero landing pad and an appropriate action. - /// The entry for an ordinary call has a try-range containing the call and - /// zero for the landing pad and the action. Calls marked 'nounwind' have - /// no entry and must not be contained in the try-range of any entry - they - /// form gaps in the table. Entries must be ordered by try-range address. + /// Compute the call-site table and the call-site ranges. The entry for an + /// invoke has a try-range containing the call, a non-zero landing pad and an + /// appropriate action. The entry for an ordinary call has a try-range + /// containing the call and zero for the landing pad and the action. Calls + /// marked 'nounwind' have no entry and must not be contained in the try-range + /// of any entry - they form gaps in the table. Entries must be ordered by + /// try-range address. CallSiteRanges vector is only populated for Itanium + /// exception handling. virtual void computeCallSiteTable( SmallVectorImpl &CallSites, + SmallVectorImpl &CallSiteRanges, const SmallVectorImpl &LandingPads, const SmallVectorImpl &FirstActions); diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp --- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -214,10 +214,25 @@ /// the landing pad and the action. Calls marked 'nounwind' have no entry and /// must not be contained in the try-range of any entry - they form gaps in the /// table. Entries must be ordered by try-range address. -void EHStreamer:: -computeCallSiteTable(SmallVectorImpl &CallSites, - const SmallVectorImpl &LandingPads, - const SmallVectorImpl &FirstActions) { +/// +/// Call-sites are split into one or more call-site ranges associated with +/// different sections of the function. 
+/// +/// - Without -basic-block-sections, all call-sites are grouped into one +/// call-site-range corresponding to the function section. +/// +/// - With -basic-block-sections, one call-site range is created for each +/// section, with its FragmentBeginLabel and FragmentEndLabel respectively +// set to the beginning and ending of the corresponding section and its +// ExceptionLabel set to the exception symbol dedicated for this section. +// Later, one LSDA header will be emitted for each call-site range with its +// call-sites following. The action table and type info table will be +// shared across all ranges. +void EHStreamer::computeCallSiteTable( + SmallVectorImpl &CallSites, + SmallVectorImpl &CallSiteRanges, + const SmallVectorImpl &LandingPads, + const SmallVectorImpl &FirstActions) { RangeMapType PadMap; computePadMap(LandingPads, PadMap); @@ -235,6 +250,21 @@ // Visit all instructions in order of address. for (const auto &MBB : *Asm->MF) { + if (&MBB == &Asm->MF->front() || MBB.isBeginSection()) { + // We start a call-site range upon function entry and at the beginning of + // every basic block section. + CallSiteRanges.push_back( + {Asm->MBBSectionRanges[MBB.getSectionIDNum()].BeginLabel, + Asm->MBBSectionRanges[MBB.getSectionIDNum()].EndLabel, + Asm->getMBBExceptionSym(MBB), CallSites.size()}); + PreviousIsInvoke = false; + SawPotentiallyThrowing = false; + LastLabel = nullptr; + } + + if (MBB.isEHPad()) + CallSiteRanges.back().IsLPRange = true; + for (const auto &MI : MBB) { if (!MI.isEHLabel()) { if (MI.isCall()) @@ -306,13 +336,22 @@ PreviousIsInvoke = true; } } - } - // If some instruction between the previous try-range and the end of the - // function may throw, create a call-site entry with no landing pad for the - // region following the try-range. 
- if (SawPotentiallyThrowing && !IsSJLJ) - CallSites.push_back({LastLabel, Asm->getFunctionEnd(), nullptr, 0}); + // We end the call-site range upon function exit and at the end of every + // basic block section. + if (&MBB == &Asm->MF->back() || MBB.isEndSection()) { + // If some instruction between the previous try-range and the end of the + // function may throw, create a call-site entry with no landing pad for + // the region following the try-range. + if (SawPotentiallyThrowing && !IsSJLJ) { + CallSiteEntry Site = {LastLabel, CallSiteRanges.back().FragmentEndLabel, + nullptr, 0}; + CallSites.push_back(Site); + SawPotentiallyThrowing = false; + } + CallSiteRanges.back().CallSiteEndIdx = CallSites.size(); + } + } } /// Emit landing pads and actions. @@ -362,9 +401,13 @@ SmallVector FirstActions; computeActionsTable(LandingPads, Actions, FirstActions); - // Compute the call-site table. + // Compute the call-site table and call-site ranges. Normally, there is only + // one call-site-range which covers the whole function. With + // -basic-block-sections, there is one call-site-range per basic block + // section. SmallVector CallSites; - computeCallSiteTable(CallSites, LandingPads, FirstActions); + SmallVector CallSiteRanges; + computeCallSiteTable(CallSites, CallSiteRanges, LandingPads, FirstActions); bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj; bool IsWasm = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::Wasm; @@ -424,35 +467,49 @@ Asm->OutContext.getOrCreateSymbol(Twine("GCC_except_table")+ Twine(Asm->getFunctionNumber())); Asm->OutStreamer->emitLabel(GCCETSym); - Asm->OutStreamer->emitLabel(Asm->getCurExceptionSym()); - - // Emit the LSDA header. - Asm->emitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart"); - Asm->emitEncodingByte(TTypeEncoding, "@TType"); + MCSymbol *CstEndLabel = Asm->createTempSymbol( + CallSiteRanges.size() > 1 ? 
"action_table_base" : "cst_end"); MCSymbol *TTBaseLabel = nullptr; - if (HaveTTData) { - // N.B.: There is a dependency loop between the size of the TTBase uleb128 - // here and the amount of padding before the aligned type table. The - // assembler must sometimes pad this uleb128 or insert extra padding before - // the type table. See PR35809 or GNU as bug 4029. - MCSymbol *TTBaseRefLabel = Asm->createTempSymbol("ttbaseref"); + if (HaveTTData) TTBaseLabel = Asm->createTempSymbol("ttbase"); - Asm->emitLabelDifferenceAsULEB128(TTBaseLabel, TTBaseRefLabel); - Asm->OutStreamer->emitLabel(TTBaseRefLabel); - } - bool VerboseAsm = Asm->OutStreamer->isVerboseAsm(); + const bool VerboseAsm = Asm->OutStreamer->isVerboseAsm(); + + // Helper for emitting references (offsets) for type table and the end of the + // call-site table (which marks the beginning of the action table). + // * For Itanium, these references will be emitted for every callsite range. + // * For SJLJ and Wasm, they will be emitted only once in the LSDA header. + auto EmitTypeTableRefAndCallSiteTableEndRef = [&]() { + Asm->emitEncodingByte(TTypeEncoding, "@TType"); + if (HaveTTData) { + // N.B.: There is a dependency loop between the size of the TTBase uleb128 + // here and the amount of padding before the aligned type table. The + // assembler must sometimes pad this uleb128 or insert extra padding + // before the type table. See PR35809 or GNU as bug 4029. + MCSymbol *TTBaseRefLabel = Asm->createTempSymbol("ttbaseref"); + Asm->emitLabelDifferenceAsULEB128(TTBaseLabel, TTBaseRefLabel); + Asm->OutStreamer->emitLabel(TTBaseRefLabel); + } - // Emit the landing pad call site table. 
- MCSymbol *CstBeginLabel = Asm->createTempSymbol("cst_begin"); - MCSymbol *CstEndLabel = Asm->createTempSymbol("cst_end"); - Asm->emitEncodingByte(CallSiteEncoding, "Call site"); - Asm->emitLabelDifferenceAsULEB128(CstEndLabel, CstBeginLabel); - Asm->OutStreamer->emitLabel(CstBeginLabel); + // The Action table follows the call-site table. So we emit the + // label difference from here (start of the call-site table for SJLJ and + // Wasm, and start of a call-site range for Itanium) to the end of the + // whole call-site table (end of the last call-site range for Itanium). + MCSymbol *CstBeginLabel = Asm->createTempSymbol("cst_begin"); + Asm->emitEncodingByte(CallSiteEncoding, "Call site"); + Asm->emitLabelDifferenceAsULEB128(CstEndLabel, CstBeginLabel); + Asm->OutStreamer->emitLabel(CstBeginLabel); + }; // SjLj / Wasm Exception handling if (IsSJLJ || IsWasm) { + Asm->OutStreamer->emitLabel(Asm->getMBBExceptionSym(Asm->MF->front())); + + // emit the LSDA header. + Asm->emitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart"); + EmitTypeTableRefAndCallSiteTableEndRef(); + unsigned idx = 0; for (SmallVectorImpl::const_iterator I = CallSites.begin(), E = CallSites.end(); I != E; ++I, ++idx) { @@ -477,6 +534,7 @@ } Asm->emitULEB128(S.Action); } + Asm->OutStreamer->emitLabel(CstEndLabel); } else { // Itanium LSDA exception handling @@ -498,50 +556,124 @@ // A missing entry in the call-site table indicates that a call is not // supposed to throw. + assert(CallSiteRanges.size() != 0 && "No call-site ranges!"); + + // There should be only one call-site range which includes all the landing + // pads. Find that call-site range here. 
+ const CallSiteRange *LandingPadRange = nullptr; + for (const CallSiteRange &CSRange : CallSiteRanges) { + if (CSRange.IsLPRange) { + assert(LandingPadRange == nullptr && + "All landing pads must be in a single callsite range."); + LandingPadRange = &CSRange; + } + } + + // The call-site table is split into its call-site ranges, each being + // emitted as: + // [ LPStartEncoding | LPStart ] + // [ TypeTableEncoding | TypeTableOffset ] + // [ CallSiteEncoding | CallSiteTableEndOffset ] + // cst_begin -> { call-site entries contained in this range } + // + // and is followed by the next call-site range. + // + // For each call-site range, CallSiteTableEndOffset is computed as the + // difference between cst_begin of that range and the last call-site-table's + // end label. This offset is used to find the action table. + unsigned Entry = 0; - for (SmallVectorImpl::const_iterator - I = CallSites.begin(), E = CallSites.end(); I != E; ++I) { - const CallSiteEntry &S = *I; + for (const CallSiteRange &CSRange : CallSiteRanges) { + if (CSRange.CallSiteBeginIdx != 0) { + // Align the call-site range for all ranges except the first. The + // first range is already aligned due to the exception table alignment. + Asm->emitAlignment(Align(4)); + } + Asm->OutStreamer->emitLabel(CSRange.ExceptionLabel); + + // Emit the LSDA header. + // If only one call-site range exists, LPStart is omitted as it is the + // same as the function entry. + if (CallSiteRanges.size() == 1) { + Asm->emitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart"); + } else if (!Asm->isPositionIndependent()) { + // For more than one call-site ranges, LPStart must be explicitly + // specified. + // For non-PIC we can simply use the absolute value. + Asm->emitEncodingByte(dwarf::DW_EH_PE_absptr, "@LPStart"); + Asm->OutStreamer->emitSymbolValue(LandingPadRange->FragmentBeginLabel, + Asm->MAI->getCodePointerSize()); + } else { + // For PIC mode, we Emit a PC-relative address for LPStart. 
+ Asm->emitEncodingByte(dwarf::DW_EH_PE_pcrel, "@LPStart"); + MCContext &Context = Asm->OutStreamer->getContext(); + MCSymbol *Dot = Context.createTempSymbol(); + Asm->OutStreamer->emitLabel(Dot); + Asm->OutStreamer->emitValue( + MCBinaryExpr::createSub( + MCSymbolRefExpr::create(LandingPadRange->FragmentBeginLabel, + Context), + MCSymbolRefExpr::create(Dot, Context), Context), + Asm->MAI->getCodePointerSize()); + } + + EmitTypeTableRefAndCallSiteTableEndRef(); - MCSymbol *EHFuncBeginSym = Asm->getFunctionBegin(); + for (size_t CallSiteIdx = CSRange.CallSiteBeginIdx; + CallSiteIdx != CSRange.CallSiteEndIdx; ++CallSiteIdx) { + const CallSiteEntry &S = CallSites[CallSiteIdx]; - // Offset of the call site relative to the start of the procedure. - if (VerboseAsm) - Asm->OutStreamer->AddComment(">> Call Site " + Twine(++Entry) + " <<"); - Asm->emitCallSiteOffset(S.BeginLabel, EHFuncBeginSym, CallSiteEncoding); - if (VerboseAsm) - Asm->OutStreamer->AddComment(Twine(" Call between ") + - S.BeginLabel->getName() + " and " + - S.EndLabel->getName()); - Asm->emitCallSiteOffset(S.EndLabel, S.BeginLabel, CallSiteEncoding); + MCSymbol *EHFuncBeginSym = CSRange.FragmentBeginLabel; + MCSymbol *EHFuncEndSym = CSRange.FragmentEndLabel; - // Offset of the landing pad relative to the start of the procedure. - if (!S.LPad) { + MCSymbol *BeginLabel = S.BeginLabel; + if (!BeginLabel) + BeginLabel = EHFuncBeginSym; + MCSymbol *EndLabel = S.EndLabel; + if (!EndLabel) + EndLabel = EHFuncEndSym; + + // Offset of the call site relative to the start of the procedure. 
if (VerboseAsm) - Asm->OutStreamer->AddComment(" has no landing pad"); - Asm->emitCallSiteValue(0, CallSiteEncoding); - } else { + Asm->OutStreamer->AddComment(">> Call Site " + Twine(++Entry) + + " <<"); + Asm->emitCallSiteOffset(BeginLabel, EHFuncBeginSym, CallSiteEncoding); if (VerboseAsm) - Asm->OutStreamer->AddComment(Twine(" jumps to ") + - S.LPad->LandingPadLabel->getName()); - Asm->emitCallSiteOffset(S.LPad->LandingPadLabel, EHFuncBeginSym, - CallSiteEncoding); - } + Asm->OutStreamer->AddComment(Twine(" Call between ") + + BeginLabel->getName() + " and " + + EndLabel->getName()); + Asm->emitCallSiteOffset(EndLabel, BeginLabel, CallSiteEncoding); + + // Offset of the landing pad relative to the start of the landing pad + // fragment. + if (!S.LPad) { + if (VerboseAsm) + Asm->OutStreamer->AddComment(" has no landing pad"); + Asm->emitCallSiteValue(0, CallSiteEncoding); + } else { + if (VerboseAsm) + Asm->OutStreamer->AddComment(Twine(" jumps to ") + + S.LPad->LandingPadLabel->getName()); + Asm->emitCallSiteOffset(S.LPad->LandingPadLabel, + LandingPadRange->FragmentBeginLabel, + CallSiteEncoding); + } - // Offset of the first associated action record, relative to the start of - // the action table. This value is biased by 1 (1 indicates the start of - // the action table), and 0 indicates that there are no actions. - if (VerboseAsm) { - if (S.Action == 0) - Asm->OutStreamer->AddComment(" On action: cleanup"); - else - Asm->OutStreamer->AddComment(" On action: " + - Twine((S.Action - 1) / 2 + 1)); + // Offset of the first associated action record, relative to the start + // of the action table. This value is biased by 1 (1 indicates the start + // of the action table), and 0 indicates that there are no actions. 
+ if (VerboseAsm) { + if (S.Action == 0) + Asm->OutStreamer->AddComment(" On action: cleanup"); + else + Asm->OutStreamer->AddComment(" On action: " + + Twine((S.Action - 1) / 2 + 1)); + } + Asm->emitULEB128(S.Action); } - Asm->emitULEB128(S.Action); } + Asm->OutStreamer->emitLabel(CstEndLabel); } - Asm->OutStreamer->emitLabel(CstEndLabel); // Emit the Action Table. int Entry = 0; @@ -596,7 +728,7 @@ const std::vector &TypeInfos = MF->getTypeInfos(); const std::vector &FilterIds = MF->getFilterIds(); - bool VerboseAsm = Asm->OutStreamer->isVerboseAsm(); + const bool VerboseAsm = Asm->OutStreamer->isVerboseAsm(); int Entry = 0; // Emit the Catch TypeInfos. diff --git a/llvm/lib/CodeGen/AsmPrinter/WasmException.h b/llvm/lib/CodeGen/AsmPrinter/WasmException.h --- a/llvm/lib/CodeGen/AsmPrinter/WasmException.h +++ b/llvm/lib/CodeGen/AsmPrinter/WasmException.h @@ -32,6 +32,7 @@ // Compute the call site table for wasm EH. void computeCallSiteTable( SmallVectorImpl &CallSites, + SmallVectorImpl &CallSiteRanges, const SmallVectorImpl &LandingPads, const SmallVectorImpl &FirstActions) override; }; diff --git a/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp b/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp --- a/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp @@ -76,6 +76,7 @@ // information. void WasmException::computeCallSiteTable( SmallVectorImpl &CallSites, + SmallVectorImpl &CallSiteRanges, const SmallVectorImpl &LandingPads, const SmallVectorImpl &FirstActions) { MachineFunction &MF = *Asm->MF; diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -49,9 +49,9 @@ // ================== // // With -fbasic-block-sections=labels, we emit the offsets of BB addresses of -// every function into a .bb_addr_map section. 
Along with the function symbols, -// this allows for mapping of virtual addresses in PMU profiles back to the -// corresponding basic blocks. This logic is implemented in AsmPrinter. This +// every function into the __llvm_bb_addr_map section. Along with the function +// symbols, this allows for mapping of virtual addresses in PMU profiles back to +// the corresponding basic blocks. This logic is implemented in AsmPrinter. This // pass only assigns the BBSectionType of every function to ``labels``. // //===----------------------------------------------------------------------===// @@ -293,6 +293,26 @@ updateBranches(MF, PreLayoutFallThroughs); } +// If the exception section begins with a landing pad, that landing pad will +// assume a zero offset (relative to @LPStart) in the LSDA. However, a value of +// zero implies "no landing pad." This function inserts a NOP just before the EH +// pad label to ensure a nonzero offset. Returns true if padding is not needed. +static bool avoidZeroOffsetLandingPad(MachineFunction &MF) { + for (auto &MBB : MF) { + if (MBB.isBeginSection() && MBB.isEHPad()) { + MachineBasicBlock::iterator MI = MBB.begin(); + while (!MI->isEHLabel()) + ++MI; + MCInst Noop; + MF.getSubtarget().getInstrInfo()->getNoop(Noop); + BuildMI(MBB, MI, DebugLoc(), + MF.getSubtarget().getInstrInfo()->get(Noop.getOpcode())); + return false; + } + } + return true; +} + bool BasicBlockSections::runOnMachineFunction(MachineFunction &MF) { auto BBSectionsType = MF.getTarget().getBBSectionsType(); assert(BBSectionsType != BasicBlockSection::None && @@ -354,6 +374,7 @@ }; sortBasicBlocksAndUpdateBranches(MF, Comparator); + avoidZeroOffsetLandingPad(MF); return true; } diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp --- a/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/llvm/lib/CodeGen/GlobalMerge.cpp @@ -223,8 +223,9 @@ // FIXME: Find better heuristics llvm::stable_sort( Globals, [&DL](const GlobalVariable *GV1, const GlobalVariable *GV2) { - 
return DL.getTypeAllocSize(GV1->getValueType()) < - DL.getTypeAllocSize(GV2->getValueType()); + // We don't support scalable global variables. + return DL.getTypeAllocSize(GV1->getValueType()).getFixedSize() < + DL.getTypeAllocSize(GV2->getValueType()).getFixedSize(); }); // If we want to just blindly group all globals together, do so. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2715,7 +2715,7 @@ EVT InVT = Lo.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), - InVT.getVectorNumElements()); + InVT.getVectorElementCount()); if (N->isStrictFPOpcode()) { Lo = DAG.getNode(N->getOpcode(), DL, { OutVT, MVT::Other }, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4613,8 +4613,8 @@ Operand.getValueType().isFloatingPoint() && "Invalid FP cast!"); if (Operand.getValueType() == VT) return Operand; // noop conversion. 
assert((!VT.isVector() || - VT.getVectorNumElements() == - Operand.getValueType().getVectorNumElements()) && + VT.getVectorElementCount() == + Operand.getValueType().getVectorElementCount()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsLT(VT) && "Invalid fpext node, dst < src!"); diff --git a/llvm/lib/DebugInfo/CodeView/RecordName.cpp b/llvm/lib/DebugInfo/CodeView/RecordName.cpp --- a/llvm/lib/DebugInfo/CodeView/RecordName.cpp +++ b/llvm/lib/DebugInfo/CodeView/RecordName.cpp @@ -9,6 +9,7 @@ #include "llvm/DebugInfo/CodeView/RecordName.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/SymbolRecordMapping.h" @@ -77,9 +78,10 @@ uint32_t Size = Indices.size(); Name = "("; for (uint32_t I = 0; I < Size; ++I) { - assert(Indices[I] < CurrentTypeIndex); - - Name.append(Types.getTypeName(Indices[I])); + if (Indices[I] < CurrentTypeIndex) + Name.append(Types.getTypeName(Indices[I])); + else + Name.append(""); if (I + 1 != Size) Name.append(", "); } diff --git a/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp --- a/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/Error.h" #include #include +#include using namespace llvm; using namespace llvm::msf; @@ -41,39 +42,68 @@ VerHeader = Version; } +void TpiStreamBuilder::updateTypeIndexOffsets(ArrayRef Sizes) { + // If we just crossed an 8KB threshold, add a type index offset. 
+ for (uint16_t Size : Sizes) { + size_t NewSize = TypeRecordBytes + Size; + constexpr size_t EightKB = 8 * 1024; + if (NewSize / EightKB > TypeRecordBytes / EightKB || TypeRecordCount == 0) { + TypeIndexOffsets.push_back( + {codeview::TypeIndex(codeview::TypeIndex::FirstNonSimpleIndex + + TypeRecordCount), + ulittle32_t(TypeRecordBytes)}); + } + ++TypeRecordCount; + TypeRecordBytes = NewSize; + } +} + void TpiStreamBuilder::addTypeRecord(ArrayRef Record, Optional Hash) { - // If we just crossed an 8KB threshold, add a type index offset. assert(((Record.size() & 3) == 0) && "The type record's size is not a multiple of 4 bytes which will " "cause misalignment in the output TPI stream!"); - size_t NewSize = TypeRecordBytes + Record.size(); - constexpr size_t EightKB = 8 * 1024; - if (NewSize / EightKB > TypeRecordBytes / EightKB || TypeRecords.empty()) { - TypeIndexOffsets.push_back( - {codeview::TypeIndex(codeview::TypeIndex::FirstNonSimpleIndex + - TypeRecords.size()), - ulittle32_t(TypeRecordBytes)}); - } - TypeRecordBytes = NewSize; + assert(Record.size() <= codeview::MaxRecordLength); + uint16_t OneSize = (uint16_t)Record.size(); + updateTypeIndexOffsets(makeArrayRef(&OneSize, 1)); - TypeRecords.push_back(Record); + TypeRecBuffers.push_back(Record); + // FIXME: Require it. if (Hash) TypeHashes.push_back(*Hash); } +void TpiStreamBuilder::addTypeRecords(ArrayRef Types, + ArrayRef Sizes, + ArrayRef Hashes) { + // Ignore empty type buffers. There should be no hashes or sizes in this case. 
+ if (Types.empty()) { + assert(Sizes.empty() && Hashes.empty()); + return; + } + + assert(((Types.size() & 3) == 0) && + "The type record's size is not a multiple of 4 bytes which will " + "cause misalignment in the output TPI stream!"); + assert(Sizes.size() == Hashes.size() && "sizes and hashes should be in sync"); + assert(std::accumulate(Sizes.begin(), Sizes.end(), 0U) == Types.size() && + "sizes of type records should sum to the size of the types"); + updateTypeIndexOffsets(Sizes); + + TypeRecBuffers.push_back(Types); + TypeHashes.insert(TypeHashes.end(), Hashes.begin(), Hashes.end()); +} + Error TpiStreamBuilder::finalize() { if (Header) return Error::success(); TpiStreamHeader *H = Allocator.Allocate(); - uint32_t Count = TypeRecords.size(); - H->Version = VerHeader; H->HeaderSize = sizeof(TpiStreamHeader); H->TypeIndexBegin = codeview::TypeIndex::FirstNonSimpleIndex; - H->TypeIndexEnd = H->TypeIndexBegin + Count; + H->TypeIndexEnd = H->TypeIndexBegin + TypeRecordCount; H->TypeRecordBytes = TypeRecordBytes; H->HashStreamIndex = HashStreamIndex; @@ -104,7 +134,7 @@ } uint32_t TpiStreamBuilder::calculateHashBufferSize() const { - assert((TypeRecords.size() == TypeHashes.size() || TypeHashes.empty()) && + assert((TypeRecordCount == TypeHashes.size() || TypeHashes.empty()) && "either all or no type records should have hashes"); return TypeHashes.size() * sizeof(ulittle32_t); } @@ -155,7 +185,7 @@ if (auto EC = Writer.writeObject(*Header)) return EC; - for (auto Rec : TypeRecords) { + for (auto Rec : TypeRecBuffers) { assert(!Rec.empty() && "Attempting to write an empty type record shifts " "all offsets in the TPI stream!"); assert(((Rec.size() & 3) == 0) && diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1373,9 +1373,19 @@ "assuming that double is 64 bits!"); APFloat apf = APF; // Floats are represented in ASCII IR as double, convert. 
- if (!isDouble) + // FIXME: We should allow 32-bit hex float and remove this. + if (!isDouble) { + // A signaling NaN is quieted on conversion, so we need to recreate the + // expected value after convert (quiet bit of the payload is clear). + bool IsSNAN = apf.isSignaling(); apf.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, - &ignored); + &ignored); + if (IsSNAN) { + APInt Payload = apf.bitcastToAPInt(); + apf = APFloat::getSNaN(APFloat::IEEEdouble(), apf.isNegative(), + &Payload); + } + } Out << format_hex(apf.bitcastToAPInt().getZExtValue(), 0, /*Upper=*/true); return; } diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -87,14 +87,14 @@ static cl::opt PrintModuleScope("print-module-scope", cl::desc("When printing IR for print-[before|after]{-all} " - "always print a module IR"), + "and change reporters, always print a module IR"), cl::init(false), cl::Hidden); static cl::list PrintFuncsList("filter-print-funcs", cl::value_desc("function names"), cl::desc("Only print IR for functions whose name " "match this for all print-[before|after][-all] " - "options"), + "and change reporter options"), cl::CommaSeparated, cl::Hidden); /// This is a helper to determine whether to print IR before or diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -85,8 +85,13 @@ } else Sym.print(OS, MAI); - if (SRE.getKind() != MCSymbolRefExpr::VK_None) - SRE.printVariantKind(OS); + const MCSymbolRefExpr::VariantKind Kind = SRE.getKind(); + if (Kind != MCSymbolRefExpr::VK_None) { + if (MAI && MAI->useParensForSymbolVariant()) // ARM + OS << '(' << MCSymbolRefExpr::getVariantKindName(Kind) << ')'; + else + OS << '@' << MCSymbolRefExpr::getVariantKindName(Kind); + } return; } @@ -197,8 +202,7 @@ MCSymbolRefExpr::MCSymbolRefExpr(const MCSymbol *Symbol, VariantKind Kind, const MCAsmInfo *MAI, 
SMLoc Loc) : MCExpr(MCExpr::SymbolRef, Loc, - encodeSubclassData(Kind, MAI->useParensForSymbolVariant(), - MAI->hasSubsectionsViaSymbols())), + encodeSubclassData(Kind, MAI->hasSubsectionsViaSymbols())), Symbol(Symbol) { assert(Symbol); } @@ -510,13 +514,6 @@ .Default(VK_Invalid); } -void MCSymbolRefExpr::printVariantKind(raw_ostream &OS) const { - if (useParensForSymbolVariant()) - OS << '(' << MCSymbolRefExpr::getVariantKindName(getKind()) << ')'; - else - OS << '@' << MCSymbolRefExpr::getVariantKindName(getKind()); -} - /* *** */ void MCTargetExpr::anchor() {} diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -1002,7 +1002,7 @@ Flags |= ELF::SHF_GROUP; } - return Ctx->getELFSection(".bb_addr_map", ELF::SHT_PROGBITS, Flags, 0, - GroupName, MCSection::NonUniqueID, + return Ctx->getELFSection("__llvm_bb_addr_map", ELF::SHT_LLVM_BB_ADDR_MAP, + Flags, 0, GroupName, MCSection::NonUniqueID, cast(TextSec.getBeginSymbol())); } diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -626,6 +626,8 @@ Type = ELF::SHT_LLVM_DEPENDENT_LIBRARIES; else if (TypeName == "llvm_sympart") Type = ELF::SHT_LLVM_SYMPART; + else if (TypeName == "llvm_bb_addr_map") + Type = ELF::SHT_LLVM_BB_ADDR_MAP; else if (TypeName.getAsInteger(0, Type)) return TokError("unknown section type"); } diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp --- a/llvm/lib/MC/MCSectionELF.cpp +++ b/llvm/lib/MC/MCSectionELF.cpp @@ -156,6 +156,8 @@ OS << "llvm_dependent_libraries"; else if (Type == ELF::SHT_LLVM_SYMPART) OS << "llvm_sympart"; + else if (Type == ELF::SHT_LLVM_BB_ADDR_MAP) + OS << "llvm_bb_addr_map"; else report_fatal_error("unsupported type 0x" + Twine::utohexstr(Type) + " for section " + getName()); diff --git 
a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -359,22 +359,21 @@ // reference to it, thus SymbolicFile should be destroyed first. LLVMContext Context; std::unique_ptr Obj; - if (identify_magic(Buf.getBuffer()) == file_magic::bitcode) { + + const file_magic Type = identify_magic(Buf.getBuffer()); + // Treat unsupported file types as having no symbols. + if (!object::SymbolicFile::isSymbolicFile(Type, &Context)) + return Ret; + if (Type == file_magic::bitcode) { auto ObjOrErr = object::SymbolicFile::createSymbolicFile( Buf, file_magic::bitcode, &Context); - if (!ObjOrErr) { - // FIXME: check only for "not an object file" errors. - consumeError(ObjOrErr.takeError()); - return Ret; - } + if (!ObjOrErr) + return ObjOrErr.takeError(); Obj = std::move(*ObjOrErr); } else { auto ObjOrErr = object::SymbolicFile::createSymbolicFile(Buf); - if (!ObjOrErr) { - // FIXME: check only for "not an object file" errors. 
- consumeError(ObjOrErr.takeError()); - return Ret; - } + if (!ObjOrErr) + return ObjOrErr.takeError(); Obj = std::move(*ObjOrErr); } @@ -393,7 +392,7 @@ static Expected> computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, object::Archive::Kind Kind, bool Thin, bool Deterministic, - ArrayRef NewMembers) { + bool NeedSymbols, ArrayRef NewMembers) { static char PaddingData[8] = {'\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'}; // This ignores the symbol table, but we only need the value mod 8 and the @@ -494,13 +493,17 @@ ModTime, Size); Out.flush(); - Expected> Symbols = - getSymbols(Buf, SymNames, HasObject); - if (auto E = Symbols.takeError()) - return std::move(E); + std::vector Symbols; + if (NeedSymbols) { + Expected> SymbolsOrErr = + getSymbols(Buf, SymNames, HasObject); + if (auto E = SymbolsOrErr.takeError()) + return std::move(E); + Symbols = std::move(*SymbolsOrErr); + } Pos += Header.size() + Data.size() + Padding.size(); - Ret.push_back({std::move(*Symbols), std::move(Header), Data, Padding}); + Ret.push_back({std::move(Symbols), std::move(Header), Data, Padding}); } // If there are no symbols, emit an empty symbol table, to satisfy Solaris // tools, older versions of which expect a symbol table in a non-empty @@ -564,8 +567,9 @@ SmallString<0> StringTableBuf; raw_svector_ostream StringTable(StringTableBuf); - Expected> DataOrErr = computeMemberData( - StringTable, SymNames, Kind, Thin, Deterministic, NewMembers); + Expected> DataOrErr = + computeMemberData(StringTable, SymNames, Kind, Thin, Deterministic, + WriteSymtab, NewMembers); if (Error E = DataOrErr.takeError()) return E; std::vector &Data = *DataOrErr; diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -276,6 +276,7 @@ STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_SYMPART); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_PART_EHDR); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_PART_PHDR); + STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_BB_ADDR_MAP); 
STRINGIFY_ENUM_CASE(ELF, SHT_GNU_ATTRIBUTES); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_HASH); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_verdef); diff --git a/llvm/lib/Object/SymbolicFile.cpp b/llvm/lib/Object/SymbolicFile.cpp --- a/llvm/lib/Object/SymbolicFile.cpp +++ b/llvm/lib/Object/SymbolicFile.cpp @@ -41,20 +41,14 @@ if (Type == file_magic::unknown) Type = identify_magic(Data); + if (!isSymbolicFile(Type, Context)) + return errorCodeToError(object_error::invalid_file_type); + switch (Type) { case file_magic::bitcode: - if (Context) - return IRObjectFile::create(Object, *Context); - LLVM_FALLTHROUGH; - case file_magic::unknown: - case file_magic::archive: - case file_magic::coff_cl_gl_object: - case file_magic::macho_universal_binary: - case file_magic::windows_resource: - case file_magic::pdb: - case file_magic::minidump: - case file_magic::tapi_file: - return errorCodeToError(object_error::invalid_file_type); + // Context is guaranteed to be non-null here, because bitcode magic only + // indicates a symbolic file when Context is non-null. 
+ return IRObjectFile::create(Object, *Context); case file_magic::elf: case file_magic::elf_executable: case file_magic::elf_shared_object: @@ -95,6 +89,39 @@ MemoryBufferRef(BCData->getBuffer(), Object.getBufferIdentifier()), *Context); } + default: + llvm_unreachable("Unexpected Binary File Type"); + } +} + +bool SymbolicFile::isSymbolicFile(file_magic Type, const LLVMContext *Context) { + switch (Type) { + case file_magic::bitcode: + return Context != nullptr; + case file_magic::elf: + case file_magic::elf_executable: + case file_magic::elf_shared_object: + case file_magic::elf_core: + case file_magic::macho_executable: + case file_magic::macho_fixed_virtual_memory_shared_lib: + case file_magic::macho_core: + case file_magic::macho_preload_executable: + case file_magic::macho_dynamically_linked_shared_lib: + case file_magic::macho_dynamic_linker: + case file_magic::macho_bundle: + case file_magic::macho_dynamically_linked_shared_lib_stub: + case file_magic::macho_dsym_companion: + case file_magic::macho_kext_bundle: + case file_magic::pecoff_executable: + case file_magic::xcoff_object_32: + case file_magic::xcoff_object_64: + case file_magic::wasm_object: + case file_magic::coff_import_library: + case file_magic::elf_relocatable: + case file_magic::macho_object: + case file_magic::coff_object: + return true; + default: + return false; } - llvm_unreachable("Unexpected Binary File Type"); } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -150,6 +150,7 @@ #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" #include "llvm/Transforms/Scalar/LoopDeletion.h" #include "llvm/Transforms/Scalar/LoopDistribute.h" +#include "llvm/Transforms/Scalar/LoopFlatten.h" #include "llvm/Transforms/Scalar/LoopFuse.h" #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" #include "llvm/Transforms/Scalar/LoopInstSimplify.h" @@ -250,6 +251,10 @@ "enable-npm-unroll-and-jam", 
cl::init(false), cl::Hidden, cl::desc("Enable the Unroll and Jam pass for the new PM (default = off)")); +static cl::opt EnableLoopFlatten( + "enable-npm-loop-flatten", cl::init(false), cl::Hidden, + cl::desc("Enable the Loop flattening pass for the new PM (default = off)")); + static cl::opt EnableSyntheticCounts( "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore, cl::desc("Run synthetic function entry count generation " @@ -510,6 +515,8 @@ C(LPM2, Level); LPM2.addPass(LoopDeletionPass()); + if (EnableLoopFlatten) + LPM2.addPass(LoopFlattenPass()); // Do not enable unrolling in PreLinkThinLTO phase during sample PGO // because it changes IR to makes profile annotation in back compile // inaccurate. The normal unroller doesn't pay attention to forced full unroll diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -97,7 +97,7 @@ MODULE_PASS("strip-debug-declare", StripDebugDeclarePass()) MODULE_PASS("strip-nondebug", StripNonDebugSymbolsPass()) MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation()) -MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass(nullptr, nullptr)) +MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass()) MODULE_PASS("verify", VerifierPass()) MODULE_PASS("dfsan", DataFlowSanitizerPass()) MODULE_PASS("asan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/false, false, true, false)) @@ -359,6 +359,7 @@ LOOP_PASS("no-op-loop", NoOpLoopPass()) LOOP_PASS("print", PrintLoopPass(dbgs())) LOOP_PASS("loop-deletion", LoopDeletionPass()) +LOOP_PASS("loop-flatten", LoopFlattenPass()) LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass()) LOOP_PASS("loop-reduce", LoopStrengthReducePass()) LOOP_PASS("indvars", IndVarSimplifyPass()) diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ 
b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" +#include #include using namespace llvm; @@ -51,18 +52,48 @@ cl::desc("Print all pass management debugging information. " "`-debug-pass-manager` must also be specified")); +// An option that prints out the IR after passes, similar to +// -print-after-all except that it only prints the IR after passes that +// change the IR. Those passes that do not make changes to the IR are +// reported as not making any changes. In addition, the initial IR is +// also reported. Other hidden options affect the output from this +// option. -filter-passes will limit the output to the named passes +// that actually change the IR and other passes are reported as filtered out. +// The specified passes will either be reported as making no changes (with +// no IR reported) or the changed IR will be reported. Also, the +// -filter-print-funcs and -print-module-scope options will do similar +// filtering based on function name, reporting changed IRs as functions(or +// modules if -print-module-scope is specified) for a particular function +// or indicating that the IR has been filtered out. The extra options +// can be combined, allowing only changed IRs for certain passes on certain +// functions to be reported in different formats, with the rest being +// reported as filtered out. +static cl::opt PrintChanged("print-changed", + cl::desc("Print changed IRs"), + cl::init(false), cl::Hidden); +// An option that supports the -print-changed option. See +// the description for -print-changed for an explanation of the use +// of this option. Note that this option has no effect without -print-changed. 
+static cl::list
+    PrintPassesList("filter-passes", cl::value_desc("pass names"),
+                    cl::desc("Only consider IR changes for passes whose names "
+                             "match for the print-changed option"),
+                    cl::CommaSeparated, cl::Hidden);
+
 namespace {
 /// Extracting Module out of \p IR unit. Also fills a textual description
 /// of \p IR for use in header when printing.
-Optional> unwrapModule(Any IR) {
+Optional>
+unwrapModule(Any IR, bool Force = false) {
   if (any_isa(IR))
     return std::make_pair(any_cast(IR), std::string());
   if (any_isa(IR)) {
     const Function *F = any_cast(IR);
-    if (!llvm::isFunctionInPrintList(F->getName()))
+    if (!Force && !llvm::isFunctionInPrintList(F->getName()))
       return None;
+
     const Module *M = F->getParent();
     return std::make_pair(M, formatv(" (function: {0})", F->getName()).str());
   }
@@ -71,18 +102,19 @@
     const LazyCallGraph::SCC *C = any_cast(IR);
     for (const LazyCallGraph::Node &N : *C) {
       const Function &F = N.getFunction();
-      if (!F.isDeclaration() && isFunctionInPrintList(F.getName())) {
+      if (Force || (!F.isDeclaration() && isFunctionInPrintList(F.getName()))) {
         const Module *M = F.getParent();
         return std::make_pair(M, formatv(" (scc: {0})", C->getName()).str());
       }
     }
+    assert(!Force && "Expected to have made a pair when forced.");
     return None;
   }
   if (any_isa(IR)) {
     const Loop *L = any_cast(IR);
     const Function *F = L->getHeader()->getParent();
-    if (!isFunctionInPrintList(F->getName()))
+    if (!Force && !isFunctionInPrintList(F->getName()))
       return None;
     const Module *M = F->getParent();
     std::string LoopName;
@@ -107,7 +139,8 @@
 }
 
 void printIR(raw_ostream &OS, const Module *M, StringRef Banner,
-             StringRef Extra = StringRef(), bool Brief = false) {
+             StringRef Extra = StringRef(), bool Brief = false,
+             bool ShouldPreserveUseListOrder = false) {
   if (Brief) {
     OS << M->getName() << '\n';
     return;
@@ -115,7 +148,7 @@
 
   if (llvm::isFunctionInPrintList("*") || llvm::forcePrintModuleIR()) {
     OS << Banner << Extra << "\n";
-    M->print(OS, nullptr, false);
+    M->print(OS, 
nullptr, ShouldPreserveUseListOrder); } else { for (const auto &F : M->functions()) { printIR(OS, &F, Banner, Extra); @@ -159,17 +192,19 @@ /// Generic IR-printing helper that unpacks a pointer to IRUnit wrapped into /// llvm::Any and does actual print job. void unwrapAndPrint(raw_ostream &OS, Any IR, StringRef Banner, - bool ForceModule = false, bool Brief = false) { + bool ForceModule = false, bool Brief = false, + bool ShouldPreserveUseListOrder = false) { if (ForceModule) { if (auto UnwrappedModule = unwrapModule(IR)) - printIR(OS, UnwrappedModule->first, Banner, UnwrappedModule->second); + printIR(OS, UnwrappedModule->first, Banner, UnwrappedModule->second, + Brief, ShouldPreserveUseListOrder); return; } if (any_isa(IR)) { const Module *M = any_cast(IR); assert(M && "module should be valid for printing"); - printIR(OS, M, Banner, "", Brief); + printIR(OS, M, Banner, "", Brief, ShouldPreserveUseListOrder); return; } @@ -197,8 +232,196 @@ llvm_unreachable("Unknown wrapped IR type"); } +// Return true when this is a pass for which changes should be ignored +inline bool isIgnored(StringRef PassID) { + return isSpecialPass(PassID, + {"PassManager", "PassAdaptor", "AnalysisManagerProxy"}); +} + +// Return true when this is a defined function for which printing +// of changes is desired. +inline bool isInterestingFunction(const Function &F) { + return llvm::isFunctionInPrintList(F.getName()); +} + +// Return true when this is a pass for which printing of changes is desired. +inline bool isInterestingPass(StringRef PassID) { + if (isIgnored(PassID)) + return false; + + static std::unordered_set PrintPassNames(PrintPassesList.begin(), + PrintPassesList.end()); + return PrintPassNames.empty() || PrintPassNames.count(PassID.str()); +} + +// Return true when this is a pass on IR for which printing +// of changes is desired. 
+bool isInteresting(Any IR, StringRef PassID) { + if (!isInterestingPass(PassID)) + return false; + if (any_isa(IR)) + return isInterestingFunction(*any_cast(IR)); + return true; +} + } // namespace +template +void ChangePrinter::saveIRBeforePass(Any IR, StringRef PassID) { + // Always need to place something on the stack because invalidated passes + // are not given the IR so it cannot be determined whether the pass was for + // something that was filtered out. + BeforeStack.emplace_back(); + + if (!isInteresting(IR, PassID)) + return; + // Is this the initial IR? + if (InitialIR) { + InitialIR = false; + handleInitialIR(IR); + } + + // Save the IR representation on the stack. + IRUnitT &Data = BeforeStack.back(); + generateIRRepresentation(IR, PassID, Data); +} + +template +void ChangePrinter::handleIRAfterPass(Any IR, StringRef PassID) { + assert(!BeforeStack.empty() && "Unexpected empty stack encountered."); + std::string Name; + + // unwrapModule has inconsistent handling of names for function IRs. + if (any_isa(IR)) { + const Function *F = any_cast(IR); + Name = formatv(" (function: {0})", F->getName()).str(); + } else { + if (auto UM = unwrapModule(IR)) + Name = UM->second; + } + if (Name.empty()) + Name = " (module)"; + + if (isIgnored(PassID)) + handleIgnored(PassID, Name); + else if (!isInteresting(IR, PassID)) + handleFiltered(PassID, Name); + else { + // Get the before rep from the stack + IRUnitT &Before = BeforeStack.back(); + // Create the after rep + IRUnitT After; + generateIRRepresentation(IR, PassID, After); + + // Was there a change in IR? 
+ if (same(Before, After)) + omitAfter(PassID, Name); + else + handleAfter(PassID, Name, Before, After, IR); + } + BeforeStack.pop_back(); +} + +template +void ChangePrinter::handleInvalidatedPass(StringRef PassID) { + assert(!BeforeStack.empty() && "Unexpected empty stack encountered."); + + // Always flag it as invalidated as we cannot determine when + // a pass for a filtered function is invalidated since we do not + // get the IR in the call. Also, the output is just alternate + // forms of the banner anyway. + handleInvalidated(PassID); + BeforeStack.pop_back(); +} + +template ChangePrinter::~ChangePrinter() { + assert(BeforeStack.empty() && "Problem with Change Printer stack."); +} + +IRChangePrinter::IRChangePrinter() : Out(dbgs()) {} + +IRChangePrinter::~IRChangePrinter() {} + +void IRChangePrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) { + if (!PrintChanged) + return; + + PIC.registerBeforePassCallback([this](StringRef P, Any IR) { + saveIRBeforePass(IR, P); + return true; + }); + + PIC.registerAfterPassCallback( + [this](StringRef P, Any IR, const PreservedAnalyses &) { + handleIRAfterPass(IR, P); + }); + PIC.registerAfterPassInvalidatedCallback( + [this](StringRef P, const PreservedAnalyses &) { + handleInvalidatedPass(P); + }); +} + +void IRChangePrinter::handleInitialIR(Any IR) { + // Always print the module. + // Unwrap and print directly to avoid filtering problems in general routines. 
+ auto UnwrappedModule = unwrapModule(IR, /*Force=*/true); + assert(UnwrappedModule && "Expected module to be unwrapped when forced."); + Out << "*** IR Dump At Start: ***" << UnwrappedModule->second << "\n"; + UnwrappedModule->first->print(Out, nullptr, + /*ShouldPreserveUseListOrder=*/true); +} + +void IRChangePrinter::generateIRRepresentation(Any IR, StringRef PassID, + std::string &Output) { + raw_string_ostream OS(Output); + // use the after banner for all cases so it will match + SmallString<20> Banner = formatv("*** IR Dump After {0} ***", PassID); + unwrapAndPrint(OS, IR, Banner, llvm::forcePrintModuleIR(), + /*Brief=*/false, /*ShouldPreserveUseListOrder=*/true); + OS.str(); +} + +void IRChangePrinter::omitAfter(StringRef PassID, std::string &Name) { + Out << formatv("*** IR Dump After {0}{1} omitted because no change ***\n", + PassID, Name); +} + +void IRChangePrinter::handleAfter(StringRef PassID, std::string &Name, + const std::string &Before, + const std::string &After, Any) { + assert(After.find("*** IR Dump") == 0 && "Unexpected banner format."); + StringRef AfterRef = After; + StringRef Banner = + AfterRef.take_until([](char C) -> bool { return C == '\n'; }); + Out << Banner; + + // LazyCallGraph::SCC already has "(scc:..." in banner so only add + // in the name if it isn't already there. 
+ if (Name.substr(0, 6) != " (scc:" && !llvm::forcePrintModuleIR()) + Out << Name; + + Out << After.substr(Banner.size()); +} + +void IRChangePrinter::handleInvalidated(StringRef PassID) { + Out << formatv("*** IR Pass {0} invalidated ***\n", PassID); +} + +void IRChangePrinter::handleFiltered(StringRef PassID, std::string &Name) { + SmallString<20> Banner = + formatv("*** IR Dump After {0}{1} filtered out ***\n", PassID, Name); + Out << Banner; +} + +void IRChangePrinter::handleIgnored(StringRef PassID, std::string &Name) { + Out << formatv("*** IR Pass {0}{1} ignored ***\n", PassID, Name); +} + +bool IRChangePrinter::same(const std::string &Before, + const std::string &After) { + return Before == After; +} + PrintIRInstrumentation::~PrintIRInstrumentation() { assert(ModuleDescStack.empty() && "ModuleDescStack is not empty at exit"); } @@ -508,4 +731,5 @@ TimePasses.registerCallbacks(PIC); OptNone.registerCallbacks(PIC); PreservedCFGChecker.registerCallbacks(PIC); + PrintChangedIR.registerCallbacks(PIC); } diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -842,7 +842,7 @@ // Test if the significand excluding the integral bit is all ones. This allows // us to test for binade boundaries. const integerPart *Parts = significandParts(); - const unsigned PartCount = partCount(); + const unsigned PartCount = partCountForBits(semantics->precision); for (unsigned i = 0; i < PartCount - 1; i++) if (~Parts[i]) return false; @@ -850,8 +850,8 @@ // Set the unused high bits to all ones when we compare. 
const unsigned NumHighBits = PartCount*integerPartWidth - semantics->precision + 1; - assert(NumHighBits <= integerPartWidth && "Can not have more high bits to " - "fill than integerPartWidth"); + assert(NumHighBits <= integerPartWidth && NumHighBits > 0 && + "Can not have more high bits to fill than integerPartWidth"); const integerPart HighBitFill = ~integerPart(0) << (integerPartWidth - NumHighBits); if (~(Parts[PartCount - 1] | HighBitFill)) @@ -864,15 +864,16 @@ // Test if the significand excluding the integral bit is all zeros. This // allows us to test for binade boundaries. const integerPart *Parts = significandParts(); - const unsigned PartCount = partCount(); + const unsigned PartCount = partCountForBits(semantics->precision); for (unsigned i = 0; i < PartCount - 1; i++) if (Parts[i]) return false; + // Compute how many bits are used in the final word. const unsigned NumHighBits = PartCount*integerPartWidth - semantics->precision + 1; - assert(NumHighBits <= integerPartWidth && "Can not have more high bits to " + assert(NumHighBits < integerPartWidth && "Can not have more high bits to " "clear than integerPartWidth"); const integerPart HighBitMask = ~integerPart(0) >> NumHighBits; @@ -2242,26 +2243,15 @@ if (!X86SpecialNan && semantics == &semX87DoubleExtended) APInt::tcSetBit(significandParts(), semantics->precision - 1); - // If we are truncating NaN, it is possible that we shifted out all of the - // set bits in a signalling NaN payload. But NaN must remain NaN, so some - // bit in the significand must be set (otherwise it is Inf). - // This can only happen with sNaN. Set the 1st bit after the quiet bit, - // so that we still have an sNaN. - // FIXME: Set quiet and return opInvalidOp (on convert of any sNaN). - // But this requires fixing LLVM to parse 32-bit hex FP or ignoring - // conversions while parsing IR. 
- if (APInt::tcIsZero(significandParts(), newPartCount)) { - assert(shift < 0 && "Should not lose NaN payload on extend"); - assert(semantics->precision >= 3 && "Unexpectedly narrow significand"); - assert(*losesInfo && "Missing payload should have set lost info"); - APInt::tcSetBit(significandParts(), semantics->precision - 3); + // Convert of sNaN creates qNaN and raises an exception (invalid op). + // This also guarantees that a sNaN does not become Inf on a truncation + // that loses all payload bits. + if (isSignaling()) { + makeQuiet(); + fs = opInvalidOp; + } else { + fs = opOK; } - - // gcc forces the Quiet bit on, which means (float)(double)(float_sNan) - // does not give you back the same bits. This is dubious, and we - // don't currently do it. You're really supposed to get - // an invalid operation signal at runtime, but nobody does that. - fs = opOK; } else { *losesInfo = false; fs = opOK; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1770,8 +1770,6 @@ if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) .setMIFlag(MachineInstr::FrameDestroy); - - MF.setHasWinCFI(HasWinCFI); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -105,6 +105,8 @@ FROUNDEVEN_MERGE_PASSTHRU, FSQRT_MERGE_PASSTHRU, FTRUNC_MERGE_PASSTHRU, + FP_ROUND_MERGE_PASSTHRU, + FP_EXTEND_MERGE_PASSTHRU, UINT_TO_FP_MERGE_PASSTHRU, SINT_TO_FP_MERGE_PASSTHRU, FCVTZU_MERGE_PASSTHRU, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- 
a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -137,6 +137,23 @@ } } +static inline MVT getPromotedVTForPredicate(MVT VT) { + assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) && + "Expected scalable predicate vector type!"); + switch (VT.getVectorMinNumElements()) { + default: + llvm_unreachable("unexpected element count for vector"); + case 2: + return MVT::nxv2i64; + case 4: + return MVT::nxv4i32; + case 8: + return MVT::nxv8i16; + case 16: + return MVT::nxv16i8; + } +} + /// Returns true if VT's elements occupy the lowest bit positions of its /// associated register class without any intervening space. /// @@ -166,6 +183,8 @@ case AArch64ISD::FROUND_MERGE_PASSTHRU: case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU: case AArch64ISD::FTRUNC_MERGE_PASSTHRU: + case AArch64ISD::FP_ROUND_MERGE_PASSTHRU: + case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU: case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU: case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU: case AArch64ISD::FCVTZU_MERGE_PASSTHRU: @@ -973,76 +992,76 @@ // FIXME: Add custom lowering of MLOAD to handle different passthrus (not a // splat of 0 or undef) once vector selects supported in SVE codegen. See // D68877 for more details. 
- for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { - if (isTypeLegal(VT)) { - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); - setOperationAction(ISD::UINT_TO_FP, VT, Custom); - setOperationAction(ISD::SINT_TO_FP, VT, Custom); - setOperationAction(ISD::FP_TO_UINT, VT, Custom); - setOperationAction(ISD::FP_TO_SINT, VT, Custom); - setOperationAction(ISD::MUL, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); - setOperationAction(ISD::SDIV, VT, Custom); - setOperationAction(ISD::UDIV, VT, Custom); - setOperationAction(ISD::SMIN, VT, Custom); - setOperationAction(ISD::UMIN, VT, Custom); - setOperationAction(ISD::SMAX, VT, Custom); - setOperationAction(ISD::UMAX, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - if (VT.getScalarType() == MVT::i1) { - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::TRUNCATE, VT, Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); - } - } - } - + for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::UINT_TO_FP, VT, Custom); + setOperationAction(ISD::SINT_TO_FP, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + setOperationAction(ISD::SMIN, VT, Custom); + setOperationAction(ISD::UMIN, VT, Custom); + setOperationAction(ISD::SMAX, VT, Custom); + setOperationAction(ISD::UMAX, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + } + + // Illegal 
unpacked integer vector types. for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); } - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) { + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::TRUNCATE, VT, Custom); - for (MVT VT : MVT::fp_scalable_vector_valuetypes()) { - if (isTypeLegal(VT)) { - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); - setOperationAction(ISD::FADD, VT, Custom); - setOperationAction(ISD::FDIV, VT, Custom); - setOperationAction(ISD::FMA, VT, Custom); - setOperationAction(ISD::FMUL, VT, Custom); - setOperationAction(ISD::FNEG, VT, Custom); - setOperationAction(ISD::FSUB, VT, Custom); - setOperationAction(ISD::FCEIL, VT, Custom); - setOperationAction(ISD::FFLOOR, VT, Custom); - setOperationAction(ISD::FNEARBYINT, VT, Custom); - setOperationAction(ISD::FRINT, VT, Custom); - setOperationAction(ISD::FROUND, VT, Custom); - setOperationAction(ISD::FROUNDEVEN, VT, Custom); - setOperationAction(ISD::FTRUNC, VT, Custom); - setOperationAction(ISD::FSQRT, VT, Custom); + // There are no legal MVT::nxv16f## based types. 
+ if (VT != MVT::nxv16i1) { + setOperationAction(ISD::SINT_TO_FP, VT, Promote); + AddPromotedToType(ISD::SINT_TO_FP, VT, getPromotedVTForPredicate(VT)); + setOperationAction(ISD::UINT_TO_FP, VT, Promote); + AddPromotedToType(ISD::UINT_TO_FP, VT, getPromotedVTForPredicate(VT)); } } - setOperationAction(ISD::SINT_TO_FP, MVT::nxv2i1, Promote); - AddPromotedToType(ISD::SINT_TO_FP, MVT::nxv2i1, MVT::nxv2i64); - setOperationAction(ISD::SINT_TO_FP, MVT::nxv4i1, Promote); - AddPromotedToType(ISD::SINT_TO_FP, MVT::nxv4i1, MVT::nxv4i32); - setOperationAction(ISD::SINT_TO_FP, MVT::nxv8i1, Promote); - AddPromotedToType(ISD::SINT_TO_FP, MVT::nxv8i1, MVT::nxv8i16); + for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, + MVT::nxv4f32, MVT::nxv2f64}) { + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::FDIV, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + setOperationAction(ISD::FMUL, VT, Custom); + setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FSUB, VT, Custom); + setOperationAction(ISD::FCEIL, VT, Custom); + setOperationAction(ISD::FFLOOR, VT, Custom); + setOperationAction(ISD::FNEARBYINT, VT, Custom); + setOperationAction(ISD::FRINT, VT, Custom); + setOperationAction(ISD::FROUND, VT, Custom); + setOperationAction(ISD::FROUNDEVEN, VT, Custom); + setOperationAction(ISD::FTRUNC, VT, Custom); + setOperationAction(ISD::FSQRT, VT, Custom); + setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::FP_ROUND, VT, Custom); + } + + setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::nxv2i1, Promote); - AddPromotedToType(ISD::UINT_TO_FP, MVT::nxv2i1, MVT::nxv2i64); - setOperationAction(ISD::UINT_TO_FP, MVT::nxv4i1, Promote); - 
AddPromotedToType(ISD::UINT_TO_FP, MVT::nxv4i1, MVT::nxv4i32); - setOperationAction(ISD::UINT_TO_FP, MVT::nxv8i1, Promote); - AddPromotedToType(ISD::UINT_TO_FP, MVT::nxv8i1, MVT::nxv8i16); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); // NOTE: Currently this has to happen after computeRegisterProperties rather // than the preferred option of combining it with the addRegisterClass call. @@ -1565,6 +1584,8 @@ MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU) @@ -2893,6 +2914,9 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType().isScalableVector()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); + assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); RTLIB::Libcall LC; @@ -2903,6 +2927,9 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType().isScalableVector()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); + bool IsStrict = Op->isStrictFPOpcode(); SDValue SrcVal = Op.getOperand(IsStrict ? 
1 : 0); EVT SrcVT = SrcVal.getValueType(); @@ -3456,7 +3483,7 @@ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_frintm: return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_frinti: return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); @@ -3820,6 +3847,8 @@ return LowerRETURNADDR(Op, DAG); case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); + case ISD::CONCAT_VECTORS: + return LowerCONCAT_VECTORS(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: @@ -9135,6 +9164,18 @@ return SDValue(); } +SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getValueType().isScalableVector() && + isTypeLegal(Op.getValueType()) && + "Expected legal scalable vector type!"); + + if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2) + return Op; + + return SDValue(); +} + SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); @@ -15974,7 +16015,8 @@ SmallVector Operands = {Pg}; for (const SDValue &V : Op->op_values()) { - assert((isa(V) || V.getValueType().isScalableVector()) && + assert((!V.getValueType().isVector() || + V.getValueType().isScalableVector()) && "Only scalable vectors are supported!"); Operands.push_back(V); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -914,6 +914,13 @@ let ParserMatchClass = Imm0_1Operand; } +// timm0_1 - as above, but use TargetConstant (TImmLeaf) 
+def timm0_1 : Operand, TImmLeaf { + let ParserMatchClass = Imm0_1Operand; +} + // imm0_15 predicate - True if the immediate is in the range [0,15] def imm0_15 : Operand, ImmLeafgetTarget().getMCAsmInfo()->usesWindowsCFI() && + MF->getFunction().needsUnwindTableEntry(); +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); @@ -1803,14 +1808,11 @@ // the memory access (I) and the increment (MBBI) can access the memory // region defined by [SP, MBBI]. const bool BaseRegSP = BaseReg == AArch64::SP; - if (BaseRegSP) { + if (BaseRegSP && needsWinCFI(I->getMF())) { // FIXME: For now, we always block the optimization over SP in windows // targets as it requires to adjust the unwind/debug info, messing up // the unwind info can actually cause a miscompile. - const MCAsmInfo *MAI = I->getMF()->getTarget().getMCAsmInfo(); - if (MAI->usesWindowsCFI() && - I->getMF()->getFunction().needsUnwindTableEntry()) - return E; + return E; } for (unsigned Count = 0; MBBI != E && Count < Limit; @@ -1866,6 +1868,14 @@ } } + const bool BaseRegSP = BaseReg == AArch64::SP; + if (BaseRegSP && needsWinCFI(I->getMF())) { + // FIXME: For now, we always block the optimization over SP in windows + // targets as it requires to adjust the unwind/debug info, messing up + // the unwind info can actually cause a miscompile. + return E; + } + // Track which register units have been modified and used between the first // insn (inclusive) and the second insn. 
ModifiedRegUnits.clear(); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -219,6 +219,13 @@ SDTCVecEltisVT<1,i1> ]>; +def SDT_AArch64FCVTR : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVec<4>, + SDTCVecEltisVT<1,i1> +]>; + +def AArch64fcvtr_mt : SDNode<"AArch64ISD::FP_ROUND_MERGE_PASSTHRU", SDT_AArch64FCVTR>; +def AArch64fcvte_mt : SDNode<"AArch64ISD::FP_EXTEND_MERGE_PASSTHRU", SDT_AArch64FCVT>; def AArch64ucvtf_mt : SDNode<"AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>; def AArch64scvtf_mt : SDNode<"AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>; def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>; @@ -1178,6 +1185,11 @@ (ZIP2_PPP_B PPR:$Ps, (PFALSE))>; // Extract subvectors from FP SVE vectors + def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 2))), + (UUNPKHI_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))), (UUNPKLO_ZZ_S ZPR:$Zs)>; def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))), @@ -1195,6 +1207,14 @@ def : Pat<(nxv16i1 (concat_vectors nxv8i1:$p1, nxv8i1:$p2)), (UZP1_PPP_B $p1, $p2)>; + // Concatenate two floating point vectors. 
+ def : Pat<(nxv4f16 (concat_vectors nxv2f16:$v1, nxv2f16:$v2)), + (UZP1_ZZZ_S $v1, $v2)>; + def : Pat<(nxv8f16 (concat_vectors nxv4f16:$v1, nxv4f16:$v2)), + (UZP1_ZZZ_H $v1, $v2)>; + def : Pat<(nxv4f32 (concat_vectors nxv2f32:$v1, nxv2f32:$v2)), + (UZP1_ZZZ_S $v1, $v2)>; + defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>; @@ -1392,40 +1412,48 @@ defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>; defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>; - defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, null_frag, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, null_frag, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>; - defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, null_frag, AArch64scvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, null_frag, AArch64scvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, null_frag, AArch64ucvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, null_frag, AArch64ucvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, null_frag, AArch64fcvtzs_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; - defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, null_frag, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, null_frag, AArch64fcvtzu_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; 
- defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, null_frag, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, null_frag, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, null_frag, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>; - defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, null_frag, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, null_frag, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>; - defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, AArch64scvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, AArch64ucvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, 
int_aarch64_sve_ucvtf_f16i64, AArch64ucvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, null_frag, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, null_frag, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; - defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; - defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; - defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; - defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm 
FCVT_ZPmZ_StoH : sve_fp_2op_p_zdr<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, AArch64fcvtr_mt, nxv4f16, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, AArch64fcvte_mt, nxv4f32, nxv4i1, nxv4f16, ElementSizeS>; + defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110010, "scvtf", ZPR16, ZPR16, null_frag, AArch64scvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010100, "scvtf", ZPR32, ZPR32, null_frag, AArch64scvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010101, "ucvtf", ZPR32, ZPR32, null_frag, AArch64ucvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110011, "ucvtf", ZPR16, ZPR16, null_frag, AArch64ucvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111010, "fcvtzs", ZPR16, ZPR16, null_frag, AArch64fcvtzs_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011100, "fcvtzs", ZPR32, ZPR32, null_frag, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111011, "fcvtzu", ZPR16, ZPR16, null_frag, AArch64fcvtzu_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011101, "fcvtzu", ZPR32, ZPR32, null_frag, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zdr<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, AArch64fcvtr_mt, nxv2f16, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoD : 
sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>; + defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, AArch64scvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, AArch64ucvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, AArch64ucvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110110, "scvtf", ZPR64, ZPR64, null_frag, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110111, "ucvtf", ZPR64, ZPR64, null_frag, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, 
int_aarch64_sve_fcvtzu_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; + defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; + defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + + def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 PPR:$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))), + (FCVT_ZPmZ_HtoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + // FP_ROUND has an additional 'precise' flag which indicates the type of rounding. 
+ // This is ignored by the pattern below where it is matched by (i64 timm0_1) + def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 PPR:$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))), + (FCVT_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; // Floating-point -> signed integer def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -172,6 +172,11 @@ emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; + + /// Emit a floating point comparison between \p LHS and \p RHS. + MachineInstr *emitFPCompare(Register LHS, Register RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitInstr(unsigned Opcode, std::initializer_list DstOps, std::initializer_list SrcOps, @@ -238,9 +243,16 @@ MachineInstr *emitFMovForFConstant(MachineInstr &MI, MachineRegisterInfo &MRI) const; - /// Emit a CSet for a compare. + /// Emit a CSet for an integer compare. + /// + /// \p DefReg is expected to be a 32-bit scalar register. MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred, MachineIRBuilder &MIRBuilder) const; + /// Emit a CSet for a FP compare. + /// + /// \p Dst is expected to be a 32-bit scalar register. + MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred, + MachineIRBuilder &MIRBuilder) const; /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. /// \p IsNegative is true if the test should be "not zero". @@ -998,20 +1010,6 @@ return 0; } -/// Helper function to select the opcode for a G_FCMP. -static unsigned selectFCMPOpc(MachineInstr &I, MachineRegisterInfo &MRI) { - // If this is a compare against +0.0, then we don't have to explicitly - // materialize a constant. 
- const ConstantFP *FPImm = getConstantFPVRegVal(I.getOperand(3).getReg(), MRI); - bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); - unsigned OpSize = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); - if (OpSize != 32 && OpSize != 64) - return 0; - unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, - {AArch64::FCMPSri, AArch64::FCMPDri}}; - return CmpOpcTbl[ShouldUseImm][OpSize == 64]; -} - /// Returns true if \p P is an unsigned integer comparison predicate. static bool isUnsignedICMPPred(const CmpInst::Predicate P) { switch (P) { @@ -2882,64 +2880,13 @@ } case TargetOpcode::G_FCMP: { - if (Ty != LLT::scalar(32)) { - LLVM_DEBUG(dbgs() << "G_FCMP result has type: " << Ty - << ", expected: " << LLT::scalar(32) << '\n'); - return false; - } - - unsigned CmpOpc = selectFCMPOpc(I, MRI); - if (!CmpOpc) + MachineIRBuilder MIRBuilder(I); + CmpInst::Predicate Pred = + static_cast(I.getOperand(1).getPredicate()); + if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), + MIRBuilder) || + !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIRBuilder)) return false; - - // FIXME: regbank - - AArch64CC::CondCode CC1, CC2; - changeFCMPPredToAArch64CC( - (CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2); - - // Partially build the compare. Decide if we need to add a use for the - // third operand based off whether or not we're comparing against 0.0. - auto CmpMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc)) - .addUse(I.getOperand(2).getReg()); - - // If we don't have an immediate compare, then we need to add a use of the - // register which wasn't used for the immediate. - // Note that the immediate will always be the last operand. 
- if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri) - CmpMI = CmpMI.addUse(I.getOperand(3).getReg()); - - const Register DefReg = I.getOperand(0).getReg(); - Register Def1Reg = DefReg; - if (CC2 != AArch64CC::AL) - Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - - MachineInstr &CSetMI = - *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) - .addDef(Def1Reg) - .addUse(AArch64::WZR) - .addUse(AArch64::WZR) - .addImm(getInvertedCondCode(CC1)); - - if (CC2 != AArch64CC::AL) { - Register Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - MachineInstr &CSet2MI = - *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) - .addDef(Def2Reg) - .addUse(AArch64::WZR) - .addUse(AArch64::WZR) - .addImm(getInvertedCondCode(CC2)); - MachineInstr &OrMI = - *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr)) - .addDef(DefReg) - .addUse(Def1Reg) - .addUse(Def2Reg); - constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI); - constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI); - } - constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); - constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI); - I.eraseFromParent(); return true; } @@ -3984,6 +3931,66 @@ return {&*CmpMI, P}; } +MachineInstr *AArch64InstructionSelector::emitCSetForFCmp( + Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); +#ifndef NDEBUG + LLT Ty = MRI.getType(Dst); + assert(!Ty.isVector() && Ty.getSizeInBits() == 32 && + "Expected a 32-bit scalar register?"); +#endif + const Register ZeroReg = AArch64::WZR; + auto EmitCSet = [&](Register CsetDst, AArch64CC::CondCode CC) { + auto CSet = + MIRBuilder.buildInstr(AArch64::CSINCWr, {CsetDst}, {ZeroReg, ZeroReg}) + .addImm(getInvertedCondCode(CC)); + constrainSelectedInstRegOperands(*CSet, TII, TRI, RBI); + return &*CSet; + }; + + AArch64CC::CondCode CC1, CC2; + changeFCMPPredToAArch64CC(Pred, CC1, CC2); + if (CC2 == 
AArch64CC::AL) + return EmitCSet(Dst, CC1); + + const TargetRegisterClass *RC = &AArch64::GPR32RegClass; + Register Def1Reg = MRI.createVirtualRegister(RC); + Register Def2Reg = MRI.createVirtualRegister(RC); + EmitCSet(Def1Reg, CC1); + EmitCSet(Def2Reg, CC2); + auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg}); + constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI); + return &*OrMI; +} + +MachineInstr * +AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS, + MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + LLT Ty = MRI.getType(LHS); + if (Ty.isVector()) + return nullptr; + unsigned OpSize = Ty.getSizeInBits(); + if (OpSize != 32 && OpSize != 64) + return nullptr; + + // If this is a compare against +0.0, then we don't have + // to explicitly materialize a constant. + const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI); + bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); + unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, + {AArch64::FCMPSri, AArch64::FCMPDri}}; + unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64]; + + // Partially build the compare. Decide if we need to add a use for the + // third operand based off whether or not we're comparing against 0.0. + auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS); + if (!ShouldUseImm) + CmpMI.addUse(RHS); + constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); + return &*CmpMI; +} + MachineInstr *AArch64InstructionSelector::emitVectorConcat( Optional Dst, Register Op1, Register Op2, MachineIRBuilder &MIRBuilder) const { @@ -4169,10 +4176,10 @@ CondCode = changeICMPPredToAArch64CC(Pred); } else { // Get the condition code for the select. 
+ CmpInst::Predicate Pred = + static_cast(CondDef->getOperand(1).getPredicate()); AArch64CC::CondCode CondCode2; - changeFCMPPredToAArch64CC( - (CmpInst::Predicate)CondDef->getOperand(1).getPredicate(), CondCode, - CondCode2); + changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2); // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two // instructions to emit the comparison. @@ -4181,16 +4188,11 @@ if (CondCode2 != AArch64CC::AL) return false; - // Make sure we'll be able to select the compare. - unsigned CmpOpc = selectFCMPOpc(*CondDef, MRI); - if (!CmpOpc) + if (!emitFPCompare(CondDef->getOperand(2).getReg(), + CondDef->getOperand(3).getReg(), MIB)) { + LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); return false; - - // Emit a new compare. - auto Cmp = MIB.buildInstr(CmpOpc, {}, {CondDef->getOperand(2).getReg()}); - if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri) - Cmp.addUse(CondDef->getOperand(3).getReg()); - constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); + } } // Emit the select. @@ -4701,8 +4703,9 @@ bool AArch64InstructionSelector::tryOptConstantBuildVec( MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const { assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); - assert(DstTy.getSizeInBits() <= 128 && "Unexpected build_vec type!"); - if (DstTy.getSizeInBits() < 32) + unsigned DstSize = DstTy.getSizeInBits(); + assert(DstSize <= 128 && "Unexpected build_vec type!"); + if (DstSize < 32) return false; // Check if we're building a constant vector, in which case we want to // generate a constant pool load instead of a vector insert sequence. @@ -4723,6 +4726,24 @@ } Constant *CV = ConstantVector::get(Csts); MachineIRBuilder MIB(I); + if (CV->isNullValue()) { + // Until the importer can support immAllZerosV in pattern leaf nodes, + // select a zero move manually here. 
+ Register DstReg = I.getOperand(0).getReg(); + if (DstSize == 128) { + auto Mov = MIB.buildInstr(AArch64::MOVIv2d_ns, {DstReg}, {}).addImm(0); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); + } else if (DstSize == 64) { + auto Mov = + MIB.buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) + .addImm(0); + MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) + .addReg(Mov.getReg(0), 0, AArch64::dsub); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AArch64::FPR64RegClass, MRI); + } + } auto *CPLoad = emitLoadFromConstantPool(CV, MIB); if (!CPLoad) { LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector"); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" +#include #define DEBUG_TYPE "aarch64-legalinfo" @@ -54,6 +55,13 @@ const LLT v2s64 = LLT::vector(2, 64); const LLT v2p0 = LLT::vector(2, p0); + std::initializer_list PackedVectorAllTypeList = {/* Begin 128bit types */ + v16s8, v8s16, v4s32, + v2s64, v2p0, + /* End 128bit types */ + /* Begin 64bit types */ + v8s8, v4s16, v2s32}; + const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine(); // FIXME: support subtargets which have neon/fp-armv8 disabled. 
@@ -63,7 +71,8 @@ } getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) - .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64, v16s8, v8s16}) + .legalFor({p0, s1, s8, s16, s32, s64}) + .legalFor(PackedVectorAllTypeList) .clampScalar(0, s1, s64) .widenScalarToNextPow2(0, 8) .fewerElementsIf( @@ -79,8 +88,8 @@ return std::make_pair(0, EltTy); }); - getActionDefinitionsBuilder(G_PHI) - .legalFor({p0, s16, s32, s64, v2s32, v4s32, v2s64}) + getActionDefinitionsBuilder(G_PHI).legalFor({p0, s16, s32, s64}) + .legalFor(PackedVectorAllTypeList) .clampScalar(0, s16, s64) .widenScalarToNextPow2(0); @@ -175,7 +184,9 @@ .minScalar(0, s32); getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG}) - .legalFor({s32, s64, v2s64, v4s32, v2s32}); + .legalFor({s32, s64, v2s64, v4s32, v2s32}) + .clampNumElements(0, v2s32, v4s32) + .clampNumElements(0, v2s64, v2s64); getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64}); diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -318,6 +318,13 @@ : Pat<(vtd (op pg:$Op1, vts:$Op2, vtd:$Op3)), (inst $Op3, $Op1, $Op2)>; +// Used to match FP_ROUND_MERGE_PASSTHRU, which has an additional flag for the +// type of rounding. This is matched by timm0_1 in pattern below and ignored. 
+class SVE_1_Op_Passthru_Round_Pat +: Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), vtd:$Op3)), + (inst $Op3, $Op1, $Op2)>; + class SVE_1_Op_Imm_OptLsl_Reverse_Pat : Pat<(vt (op (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))), (vt zprty:$Op1))), @@ -2299,6 +2306,25 @@ def : SVE_1_Op_Passthru_Pat(NAME)>; } +multiclass sve_fp_2op_p_zdr opc, string asm, + RegisterOperand i_zprtype, + RegisterOperand o_zprtype, + SDPatternOperator int_op, + SDPatternOperator ir_op, ValueType vt1, + ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { + def NAME : sve_fp_2op_p_zd; + + // convert vt1 to a packed type for the intrinsic patterns + defvar packedvt1 = !cond(!eq(!cast(vt1), "nxv2f16"): nxv8f16, + !eq(!cast(vt1), "nxv4f16"): nxv8f16, + !eq(!cast(vt1), "nxv2f32"): nxv4f32, + 1 : vt1); + + def : SVE_3_Op_Pat(NAME)>; + + def : SVE_1_Op_Passthru_Round_Pat(NAME)>; +} + multiclass sve_fp_2op_p_zd_HSD opc, string asm, SDPatternOperator op> { def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>; def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1712,7 +1712,7 @@ SDLoc DL(N); uint64_t RemainderOffset = COffsetVal; uint64_t ImmField = 0; - const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned); + const unsigned NumBits = TII->getNumFlatOffsetBits(IsSigned); if (IsSigned) { // Use signed division by a power of two to truncate towards 0. 
int64_t D = 1LL << (NumBits - 1); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1016,7 +1016,7 @@ return isUInt<12>(Imm); } - unsigned getNumFlatOffsetBits(unsigned AddrSpace, bool Signed) const; + unsigned getNumFlatOffsetBits(bool Signed) const; /// Returns if \p Offset is legal for the subtarget as the offset to a FLAT /// encoded instruction. If \p Signed, this is for an instruction that diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6893,14 +6893,7 @@ return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); } -unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace, - bool Signed) const { - if (!ST.hasFlatInstOffsets()) - return 0; - - if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS) - return 0; - +unsigned SIInstrInfo::getNumFlatOffsetBits(bool Signed) const { if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) return Signed ? 12 : 11; @@ -6916,13 +6909,10 @@ if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS) return false; - if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { - return (Signed && isInt<12>(Offset)) || - (!Signed && isUInt<11>(Offset)); - } + if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) + return Signed ? isInt<12>(Offset) : isUInt<11>(Offset); - return (Signed && isInt<13>(Offset)) || - (!Signed && isUInt<12>(Offset)); + return Signed ? 
isInt<13>(Offset) :isUInt<12>(Offset); } diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -418,7 +418,7 @@ def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">; def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">; -multiclass VOP3P_Real_vi op> { +multiclass VOP3P_Real_vi op> { def _vi : VOP3P_Real(NAME), SIEncodingFamily.VI>, VOP3Pe (NAME).Pfl> { let AssemblerPredicate = HasVOP3PInsts; @@ -426,7 +426,7 @@ } } -multiclass VOP3P_Real_MAI op> { +multiclass VOP3P_Real_MAI op> { def _vi : VOP3P_Real(NAME), SIEncodingFamily.VI>, VOP3Pe_MAI (NAME).Pfl> { let AssemblerPredicate = HasMAIInsts; @@ -434,32 +434,32 @@ } } -defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>; -defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>; -defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>; -defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>; -defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x384>; -defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>; -defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>; -defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>; -defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>; -defm V_PK_MAD_U16 : VOP3P_Real_vi <0x389>; - -defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>; -defm V_PK_SUB_U16 : VOP3P_Real_vi <0x38b>; -defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>; -defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>; -defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>; -defm V_PK_ADD_F16 : VOP3P_Real_vi <0x38f>; -defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>; -defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>; -defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>; +defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>; +defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>; +defm V_PK_ADD_I16 : VOP3P_Real_vi <0x02>; +defm V_PK_SUB_I16 : VOP3P_Real_vi <0x03>; +defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x04>; +defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x05>; +defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x06>; +defm V_PK_MAX_I16 : VOP3P_Real_vi 
<0x07>; +defm V_PK_MIN_I16 : VOP3P_Real_vi <0x08>; +defm V_PK_MAD_U16 : VOP3P_Real_vi <0x09>; + +defm V_PK_ADD_U16 : VOP3P_Real_vi <0x0a>; +defm V_PK_SUB_U16 : VOP3P_Real_vi <0x0b>; +defm V_PK_MAX_U16 : VOP3P_Real_vi <0x0c>; +defm V_PK_MIN_U16 : VOP3P_Real_vi <0x0d>; +defm V_PK_FMA_F16 : VOP3P_Real_vi <0x0e>; +defm V_PK_ADD_F16 : VOP3P_Real_vi <0x0f>; +defm V_PK_MUL_F16 : VOP3P_Real_vi <0x10>; +defm V_PK_MIN_F16 : VOP3P_Real_vi <0x11>; +defm V_PK_MAX_F16 : VOP3P_Real_vi <0x12>; let SubtargetPredicate = HasMadMixInsts in { -defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>; -defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>; -defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>; +defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x20>; +defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x21>; +defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x22>; } let SubtargetPredicate = HasFmaMixInsts in { @@ -467,54 +467,54 @@ // The mad_mix instructions were renamed and their behaviors changed, // but the opcode stayed the same so we need to put these in a // different DecoderNamespace to avoid the ambiguity. 
-defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x3a0>; -defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>; -defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>; +defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x20>; +defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x21>; +defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x22>; } } let SubtargetPredicate = HasDot2Insts in { -defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>; -defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>; -defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>; -defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>; -defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>; +defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>; +defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>; +defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>; +defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x29>; +defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x2b>; } // End SubtargetPredicate = HasDot2Insts let SubtargetPredicate = HasDot1Insts in { -defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>; -defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>; +defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x28>; +defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x2a>; } // End SubtargetPredicate = HasDot1Insts let SubtargetPredicate = HasMAIInsts in { -defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x3d8>; -defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x3d9>; -defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MAI <0x3c0>; -defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MAI <0x3c1>; -defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MAI <0x3c2>; -defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MAI <0x3c4>; -defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MAI <0x3c5>; -defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MAI <0x3c8>; -defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MAI <0x3c9>; -defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MAI <0x3ca>; -defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MAI <0x3cc>; -defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MAI <0x3cd>; -defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MAI <0x3d0>; -defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MAI <0x3d1>; -defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MAI <0x3d2>; -defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MAI <0x3d4>; -defm V_MFMA_I32_16X16X16I8 
: VOP3P_Real_MAI <0x3d5>; -defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MAI <0x3e8>; -defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MAI <0x3e9>; -defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MAI <0x3eb>; -defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MAI <0x3ec>; -defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MAI <0x3ed>; +defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x58>; +defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x59>; +defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MAI <0x40>; +defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MAI <0x41>; +defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MAI <0x42>; +defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MAI <0x44>; +defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MAI <0x45>; +defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MAI <0x48>; +defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MAI <0x49>; +defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MAI <0x4a>; +defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MAI <0x4c>; +defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MAI <0x4d>; +defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MAI <0x50>; +defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MAI <0x51>; +defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MAI <0x52>; +defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MAI <0x54>; +defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MAI <0x55>; +defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MAI <0x68>; +defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MAI <0x69>; +defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MAI <0x6b>; +defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MAI <0x6c>; +defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MAI <0x6d>; } // End SubtargetPredicate = HasMAIInsts @@ -523,48 +523,48 @@ //===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { - multiclass VOP3P_Real_gfx10 op> { + multiclass VOP3P_Real_gfx10 op> { def _gfx10 : VOP3P_Real(NAME), SIEncodingFamily.GFX10>, VOP3Pe_gfx10 (NAME).Pfl>; } } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" -defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x000>; -defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x001>; -defm 
V_PK_ADD_I16 : VOP3P_Real_gfx10<0x002>; -defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x003>; -defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x004>; -defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x005>; -defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x006>; -defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x007>; -defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x008>; -defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x009>; -defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x00a>; -defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x00b>; -defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x00c>; -defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x00d>; -defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x00e>; -defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x00f>; -defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x010>; -defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x011>; -defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x012>; -defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x020>; -defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x021>; -defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x022>; +defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x00>; +defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x01>; +defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x02>; +defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x03>; +defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x04>; +defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x05>; +defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x06>; +defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x07>; +defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x08>; +defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x09>; +defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x0a>; +defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x0b>; +defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x0c>; +defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x0d>; +defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x0e>; +defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x0f>; +defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x10>; +defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x11>; +defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x12>; +defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x20>; +defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x21>; +defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x22>; let SubtargetPredicate = HasDot2Insts in { -defm 
V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x013>; -defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x014>; -defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x015>; -defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x017>; -defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x019>; +defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>; +defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>; +defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>; +defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x17>; +defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x19>; } // End SubtargetPredicate = HasDot2Insts let SubtargetPredicate = HasDot1Insts in { -defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x016>; -defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x018>; +defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x16>; +defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x18>; } // End SubtargetPredicate = HasDot1Insts diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -296,7 +296,7 @@ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); } -class VOP3Pe op, VOPProfile P> : Enc64 { +class VOP3Pe op, VOPProfile P> : Enc64 { bits<8> vdst; // neg, neg_hi, op_sel put in srcN_modifiers bits<4> src0_modifiers; @@ -320,8 +320,8 @@ let Inst{15} = !if(P.HasClamp, clamp{0}, 0); - let Inst{25-16} = op; - let Inst{31-26} = 0x34; //encoding + let Inst{22-16} = op; + let Inst{31-23} = 0x1a7; //encoding let Inst{40-32} = !if(P.HasSrc0, src0, 0); let Inst{49-41} = !if(P.HasSrc1, src1, 0); let Inst{58-50} = !if(P.HasSrc2, src2, 0); @@ -332,7 +332,7 @@ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) } -class VOP3Pe_MAI op, VOPProfile P> : Enc64 { +class VOP3Pe_MAI op, VOPProfile P> : Enc64 { bits<8> vdst; bits<10> src0; bits<10> src1; @@ -349,8 +349,8 @@ let Inst{15} = !if(P.HasClamp, clamp{0}, 0); - let Inst{25-16} = op; - let Inst{31-26} = 0x34; //encoding + let Inst{22-16} = op; + let Inst{31-23} = 0x1a7; //encoding let Inst{40-32} = !if(P.HasSrc0, 
src0{8-0}, 0); let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); let Inst{58-50} = !if(P.HasSrc2, src2, 0); @@ -362,8 +362,8 @@ } -class VOP3Pe_gfx10 op, VOPProfile P> : VOP3Pe { - let Inst{31-26} = 0x33; //encoding +class VOP3Pe_gfx10 op, VOPProfile P> : VOP3Pe { + let Inst{31-23} = 0x198; //encoding } class VOP3be_gfx6_gfx7 op, VOPProfile p> : VOP3be

{ diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -903,7 +903,7 @@ MCSymbol *MCSym; if (ACPV->isLSDA()) { - MCSym = getCurExceptionSym(); + MCSym = getMBBExceptionSym(MF->front()); } else if (ACPV->isBlockAddress()) { const BlockAddress *BA = cast(ACPV)->getBlockAddress(); diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -2575,7 +2575,6 @@ Requires<[IsThumb2, HasDSP]>; // Signed/Unsigned saturate. -let hasSideEffects = 1 in class T2SatI : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, []> { bits<4> Rd; diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -355,7 +355,8 @@ const TargetRegisterInfo &TRI; const ARMBaseInstrInfo &TII; MachineFunction *MF = nullptr; - MachineInstr *InsertPt = nullptr; + MachineBasicBlock::iterator StartInsertPt; + MachineBasicBlock *StartInsertBB = nullptr; MachineInstr *Start = nullptr; MachineInstr *Dec = nullptr; MachineInstr *End = nullptr; @@ -402,7 +403,7 @@ // Check that the predication in the loop will be equivalent once we // perform the conversion. Also ensure that we can provide the number // of elements to the loop start instruction. - bool ValidateTailPredicate(MachineInstr *StartInsertPt); + bool ValidateTailPredicate(); // Check that any values available outside of the loop will be the same // after tail predication conversion. 
@@ -585,10 +586,7 @@ return false; } -bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) { - if (!StartInsertPt) - return false; - +bool LowOverheadLoop::ValidateTailPredicate() { if (!IsTailPredicationLegal()) { LLVM_DEBUG(if (VCTPs.empty()) dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n"; @@ -605,14 +603,33 @@ return false; } - if (!VPTState::isValid(RDA)) + if (!VPTState::isValid(RDA)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n"); return false; + } if (!ValidateLiveOuts()) { LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n"); return false; } + // Check that creating a [W|D]LSTP, which will define LR with an element + // count instead of iteration count, won't affect any other instructions + // than the LoopStart and LoopDec. + // TODO: We should try to insert the [W|D]LSTP after any of the other uses. + if (StartInsertPt == Start && Start->getOperand(0).getReg() == ARM::LR) { + if (auto *IterCount = RDA.getMIOperand(Start, 0)) { + SmallPtrSet Uses; + RDA.getGlobalUses(IterCount, ARM::LR, Uses); + for (auto *Use : Uses) { + if (Use != Start && Use != Dec) { + LLVM_DEBUG(dbgs() << " ARM Loops: Found LR use: " << *Use); + return false; + } + } + } + } + // For tail predication, we need to provide the number of elements, instead // of the iteration count, to the loop start instruction. The number of // elements is provided to the vctp instruction, so we need to check that @@ -629,47 +646,10 @@ return false; } - // The element count register maybe defined after InsertPt, in which case we - // need to try to move either InsertPt or the def so that the [w|d]lstp can - // use the value. 
- MachineBasicBlock *InsertBB = StartInsertPt->getParent(); - - if (!RDA.isReachingDefLiveOut(StartInsertPt, NumElements)) { - if (auto *ElemDef = RDA.getLocalLiveOutMIDef(InsertBB, NumElements)) { - if (RDA.isSafeToMoveForwards(ElemDef, StartInsertPt)) { - ElemDef->removeFromParent(); - InsertBB->insert(MachineBasicBlock::iterator(StartInsertPt), ElemDef); - LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: " - << *ElemDef); - } else if (RDA.isSafeToMoveBackwards(StartInsertPt, ElemDef)) { - StartInsertPt->removeFromParent(); - InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), - StartInsertPt); - LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); - } else { - // If we fail to move an instruction and the element count is provided - // by a mov, use the mov operand if it will have the same value at the - // insertion point - MachineOperand Operand = ElemDef->getOperand(1); - if (isMovRegOpcode(ElemDef->getOpcode()) && - RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg()) == - RDA.getUniqueReachingMIDef(StartInsertPt, Operand.getReg())) { - TPNumElements = Operand; - NumElements = TPNumElements.getReg(); - } else { - LLVM_DEBUG(dbgs() - << "ARM Loops: Unable to move element count to loop " - << "start instruction.\n"); - return false; - } - } - } - } - // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect // world the [w|d]lstp instruction would be last instruction in the preheader // and so it would only affect instructions within the loop body. But due to - // scheduling, and/or the logic in this pass (above), the insertion point can + // scheduling, and/or the logic in this pass, the insertion point can // be moved earlier. So if the Loop Start isn't the last instruction in the // preheader, and if the initial element count is smaller than the vector // width, the Loop Start instruction will immediately generate one or more @@ -677,13 +657,17 @@ // instructions in the preheader. 
auto CannotInsertWDLSTPBetween = [](MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) { - for (; I != E; ++I) - if (shouldInspect(*I)) + for (; I != E; ++I) { + if (shouldInspect(*I)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP" + << " insertion: " << *I); return true; + } + } return false; }; - if (CannotInsertWDLSTPBetween(StartInsertPt, InsertBB->end())) + if (CannotInsertWDLSTPBetween(StartInsertPt, StartInsertBB->end())) return false; // Especially in the case of while loops, InsertBB may not be the @@ -704,7 +688,7 @@ // Search backwards for a def, until we get to InsertBB. MachineBasicBlock *MBB = Preheader; - while (MBB && MBB != InsertBB) { + while (MBB && MBB != StartInsertBB) { if (CannotProvideElements(MBB, NumElements)) { LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n"); return false; @@ -741,11 +725,17 @@ continue; if (isSubImmOpcode(MI->getOpcode())) { - if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) + if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" + " count: " << *MI); return false; + } FoundSub = true; - } else + } else { + LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" + " count: " << *MI); return false; + } } ToRemove.insert(ElementChain.begin(), ElementChain.end()); } @@ -1017,10 +1007,17 @@ // Find a suitable position to insert the loop start instruction. It needs to // be able to safely define LR. auto FindStartInsertionPoint = [](MachineInstr *Start, - ReachingDefAnalysis &RDA) -> MachineInstr* { + MachineInstr *Dec, + MachineBasicBlock::iterator &InsertPt, + MachineBasicBlock *&InsertBB, + ReachingDefAnalysis &RDA, + InstSet &ToRemove) { // We can define LR because LR already contains the same value. 
- if (Start->getOperand(0).getReg() == ARM::LR) - return Start; + if (Start->getOperand(0).getReg() == ARM::LR) { + InsertPt = MachineBasicBlock::iterator(Start); + InsertBB = Start->getParent(); + return true; + } unsigned CountReg = Start->getOperand(0).getReg(); auto IsMoveLR = [&CountReg](MachineInstr *MI) { @@ -1030,37 +1027,83 @@ MI->getOperand(2).getImm() == ARMCC::AL; }; - MachineBasicBlock *MBB = Start->getParent(); - // Find an insertion point: // - Is there a (mov lr, Count) before Start? If so, and nothing else - // writes to Count before Start, we can insert at that mov. - if (auto *LRDef = RDA.getUniqueReachingMIDef(Start, ARM::LR)) - if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) - return LRDef; + // writes to Count before Start, we can insert at start. + if (auto *LRDef = RDA.getUniqueReachingMIDef(Start, ARM::LR)) { + if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) { + SmallPtrSet Ignore = { Dec }; + if (!TryRemove(LRDef, RDA, ToRemove, Ignore)) + return false; + InsertPt = MachineBasicBlock::iterator(Start); + InsertBB = Start->getParent(); + return true; + } + } // - Is there a (mov lr, Count) after Start? If so, and nothing else writes - // to Count after Start, we can insert at that mov. - if (auto *LRDef = RDA.getLocalLiveOutMIDef(MBB, ARM::LR)) - if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) - return LRDef; + // to Count after Start, we can insert at that mov (which will now be + // dead). + MachineBasicBlock *MBB = Start->getParent(); + if (auto *LRDef = RDA.getLocalLiveOutMIDef(MBB, ARM::LR)) { + if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) { + SmallPtrSet Ignore = { Start, Dec }; + if (!TryRemove(LRDef, RDA, ToRemove, Ignore)) + return false; + InsertPt = MachineBasicBlock::iterator(LRDef); + InsertBB = LRDef->getParent(); + return true; + } + } // We've found no suitable LR def and Start doesn't use LR directly. 
Can we // just define LR anyway? - return RDA.isSafeToDefRegAt(Start, ARM::LR) ? Start : nullptr; + if (!RDA.isSafeToDefRegAt(Start, ARM::LR)) + return false; + + InsertPt = MachineBasicBlock::iterator(Start); + InsertBB = Start->getParent(); + return true; }; - InsertPt = FindStartInsertionPoint(Start, RDA); - Revert = !ValidateRanges(Start, End, BBUtils, ML) || !InsertPt; - CannotTailPredicate = !ValidateTailPredicate(InsertPt); + // We know that we can safely define LR at InsertPt, but maybe we could + // push the insertion point to later on in the basic block. + auto TryAdjustInsertionPoint = [](MachineBasicBlock::iterator &InsertPt, + MachineInstr *Start, + ReachingDefAnalysis &RDA) { + + MachineBasicBlock *MBB = InsertPt->getParent(); + MachineBasicBlock::iterator FirstNonTerminator = + MBB->getFirstTerminator(); + unsigned CountReg = Start->getOperand(0).getReg(); - LLVM_DEBUG(if (!InsertPt) + dbgs() << "ARM Loops: Unable to find safe insertion point.\n"; + // Get the latest possible insertion point and check whether the semantics + // will be maintained if Start was inserted there. 
+ if (FirstNonTerminator == MBB->end()) { + if (RDA.isReachingDefLiveOut(Start, CountReg) && + RDA.isReachingDefLiveOut(Start, ARM::LR)) + InsertPt = FirstNonTerminator; + } else if (RDA.hasSameReachingDef(Start, &*FirstNonTerminator, CountReg) && + RDA.hasSameReachingDef(Start, &*FirstNonTerminator, ARM::LR)) + InsertPt = FirstNonTerminator; + }; + + if (!FindStartInsertionPoint(Start, Dec, StartInsertPt, StartInsertBB, RDA, + ToRemove)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); + Revert = true; + return; + } + TryAdjustInsertionPoint(StartInsertPt, Start, RDA); + LLVM_DEBUG(if (StartInsertPt == StartInsertBB->end()) + dbgs() << "ARM Loops: Will insert LoopStart at end of block\n"; else - dbgs() << "ARM Loops: Start insertion point: " << *InsertPt; - if (CannotTailPredicate) - dbgs() << "ARM Loops: Couldn't validate tail predicate.\n" + dbgs() << "ARM Loops: Will insert LoopStart at " + << *StartInsertPt ); + + Revert = !ValidateRanges(Start, End, BBUtils, ML); + CannotTailPredicate = !ValidateTailPredicate(); } bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { @@ -1398,7 +1441,7 @@ // Collect and remove the users of iteration count. SmallPtrSet Killed = { LoLoop.Start, LoLoop.Dec, - LoLoop.End, LoLoop.InsertPt }; + LoLoop.End }; if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed)) LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n"); } @@ -1409,24 +1452,21 @@ // calculate the number of loop iterations. 
IterationCountDCE(LoLoop); - MachineInstr *InsertPt = LoLoop.InsertPt; + MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt; MachineInstr *Start = LoLoop.Start; - MachineBasicBlock *MBB = InsertPt->getParent(); + MachineBasicBlock *MBB = LoLoop.StartInsertBB; bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; unsigned Opc = LoLoop.getStartOpcode(); MachineOperand &Count = LoLoop.getLoopStartOperand(); MachineInstrBuilder MIB = - BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); + BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc)); MIB.addDef(ARM::LR); MIB.add(Count); if (!IsDo) MIB.add(Start->getOperand(1)); - // If we're inserting at a mov lr, then remove it as it's redundant. - if (InsertPt != Start) - LoLoop.ToRemove.insert(InsertPt); LoLoop.ToRemove.insert(Start); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); return &*MIB; diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -131,6 +131,26 @@ .setMIFlag(MachineInstr::FrameSetup); } +static void restoreStatusRegister(MachineFunction &MF, MachineBasicBlock &MBB) { + const AVRMachineFunctionInfo *AFI = MF.getInfo(); + + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + + DebugLoc DL = MBBI->getDebugLoc(); + const AVRSubtarget &STI = MF.getSubtarget(); + const AVRInstrInfo &TII = *STI.getInstrInfo(); + + // Emit special epilogue code to restore R1, R0 and SREG in interrupt/signal + // handlers at the very end of the function, just before reti. 
+ if (AFI->isInterruptOrSignalHandler()) { + BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R0); + BuildMI(MBB, MBBI, DL, TII.get(AVR::OUTARr)) + .addImm(0x3f) + .addReg(AVR::R0, RegState::Kill); + BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R1R0); + } +} + void AVRFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const AVRMachineFunctionInfo *AFI = MF.getInfo(); @@ -151,18 +171,9 @@ const AVRSubtarget &STI = MF.getSubtarget(); const AVRInstrInfo &TII = *STI.getInstrInfo(); - // Emit special epilogue code to restore R1, R0 and SREG in interrupt/signal - // handlers at the very end of the function, just before reti. - if (AFI->isInterruptOrSignalHandler()) { - BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R0); - BuildMI(MBB, MBBI, DL, TII.get(AVR::OUTARr)) - .addImm(0x3f) - .addReg(AVR::R0, RegState::Kill); - BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R1R0); - } - // Early exit if there is no need to restore the frame pointer. if (!FrameSize) { + restoreStatusRegister(MF, MBB); return; } @@ -198,6 +209,8 @@ // Write back R29R28 to SP and temporarily disable interrupts. 
BuildMI(MBB, MBBI, DL, TII.get(AVR::SPWRITE), AVR::SP) .addReg(AVR::R29R28, RegState::Kill); + + restoreStatusRegister(MF, MBB); } // Return true if the specified function should have a dedicated frame diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h @@ -37,6 +37,8 @@ static unsigned getModuleMatchQuality(const Module &M); void adjustPassManager(PassManagerBuilder &PMB) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool DebugPassManager) override; TargetPassConfig *createPassConfig(PassManagerBase &PM) override; TargetTransformInfo getTargetTransformInfo(const Function &F) override; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" +#include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" @@ -273,6 +274,18 @@ }); } +void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB, + bool DebugPassManager) { + PB.registerOptimizerLastEPCallback( + [=](ModulePassManager &MPM, PassBuilder::OptimizationLevel Level) { + LoopPassManager LPM(DebugPassManager); + FunctionPassManager FPM(DebugPassManager); + LPM.addPass(HexagonVectorLoopCarriedReusePass()); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + }); +} + TargetTransformInfo HexagonTargetMachine::getTargetTransformInfo(const Function &F) { return TargetTransformInfo(HexagonTTIImpl(this, F)); diff --git 
a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -528,6 +528,11 @@ Inst.addOperand(MCOperand::createReg(VSRpRegs[getVSRpEvenReg()])); } + void addRegVSRpEvenRCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(VSRpRegs[getVSRpEvenReg()])); + } + void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(CRBITRegs[getCRBit()])); diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp --- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -212,6 +212,15 @@ return MCDisassembler::Success; } +static DecodeStatus decodeVSRpEvenOperands(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo & 1) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(VSRpRegs[RegNo >> 1])); + return MCDisassembler::Success; +} + static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm, int64_t Address, const void *Decoder) { // Decode the memri field (imm, reg), which has the low 16-bits as the diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h @@ -93,6 +93,9 @@ unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + unsigned getVSRpEvenEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; /// getMachineOpValue - Return binary encoding 
of operand. If the machine /// operand requires relocation, record the relocation and return zero. diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -94,6 +94,16 @@ return 0; } +unsigned +PPCMCCodeEmitter::getVSRpEvenEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + assert(MI.getOperand(OpNo).isReg() && "Operand should be a register"); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) + << 1; + return RegBits; +} + unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -647,6 +647,185 @@ let Inst{31} = 0; } +class XX3Form_AT3_XAB6 opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list pattern> + : I { + bits<3> AT; + bits<6> XA; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6-8} = AT; + let Inst{9-10} = 0; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21-28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = 0; +} + +class MMIRR_XX3Form_XY4P2_XAB6 opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + bits<6> XA; + bits<6> XB; + bits<4> XMSK; + bits<4> YMSK; + bits<2> PMSK; + + let Pattern = pattern; + + // The prefix. + let Inst{6-7} = 3; + let Inst{8-11} = 9; + let Inst{12-15} = 0; + let Inst{16-17} = PMSK; + let Inst{18-23} = 0; + let Inst{24-27} = XMSK; + let Inst{28-31} = YMSK; + + // The instruction. 
+ let Inst{38-40} = AT; + let Inst{41-42} = 0; + let Inst{43-47} = XA{4-0}; + let Inst{48-52} = XB{4-0}; + let Inst{53-60} = xo; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = 0; +} + +class MMIRR_XX3Form_XY4_XAB6 opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + bits<6> XA; + bits<6> XB; + bits<4> XMSK; + bits<4> YMSK; + + let Pattern = pattern; + + // The prefix. + let Inst{6-7} = 3; + let Inst{8-11} = 9; + let Inst{12-23} = 0; + let Inst{24-27} = XMSK; + let Inst{28-31} = YMSK; + + // The instruction. + let Inst{38-40} = AT; + let Inst{41-42} = 0; + let Inst{43-47} = XA{4-0}; + let Inst{48-52} = XB{4-0}; + let Inst{53-60} = xo; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = 0; +} + +class MMIRR_XX3Form_X4Y2_XAB6 opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + bits<6> XA; + bits<6> XB; + bits<4> XMSK; + bits<2> YMSK; + + let Pattern = pattern; + + // The prefix. + let Inst{6-7} = 3; + let Inst{8-11} = 9; + let Inst{12-23} = 0; + let Inst{24-27} = XMSK; + let Inst{28-29} = YMSK; + let Inst{30-31} = 0; + + // The instruction. + let Inst{38-40} = AT; + let Inst{41-42} = 0; + let Inst{43-47} = XA{4-0}; + let Inst{48-52} = XB{4-0}; + let Inst{53-60} = xo; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = 0; +} + +class MMIRR_XX3Form_XY4P8_XAB6 opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + bits<6> XA; + bits<6> XB; + bits<4> XMSK; + bits<4> YMSK; + bits<8> PMSK; + + let Pattern = pattern; + + // The prefix. + let Inst{6-7} = 3; + let Inst{8-11} = 9; + let Inst{12-15} = 0; + let Inst{16-23} = PMSK; + let Inst{24-27} = XMSK; + let Inst{28-31} = YMSK; + + // The instruction. 
+ let Inst{38-40} = AT; + let Inst{41-42} = 0; + let Inst{43-47} = XA{4-0}; + let Inst{48-52} = XB{4-0}; + let Inst{53-60} = xo; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = 0; +} + +class MMIRR_XX3Form_XYP4_XAB6 opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + bits<6> XA; + bits<6> XB; + bits<4> XMSK; + bits<4> YMSK; + bits<4> PMSK; + + let Pattern = pattern; + + // The prefix. + let Inst{6-7} = 3; + let Inst{8-11} = 9; + let Inst{12-15} = 0; + let Inst{16-19} = PMSK; + let Inst{20-23} = 0; + let Inst{24-27} = XMSK; + let Inst{28-31} = YMSK; + + // The instruction. + let Inst{38-40} = AT; + let Inst{41-42} = 0; + let Inst{43-47} = XA{4-0}; + let Inst{48-52} = XB{4-0}; + let Inst{53-60} = xo; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = 0; +} + def PrefixInstrs : Predicate<"Subtarget->hasPrefixInstrs()">; def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">; def PairedVectorMemops : Predicate<"Subtarget->pairedVectorMemops()">; @@ -802,6 +981,286 @@ } } +// Multiclass definitions for MMA accumulator instructions. +// ---------------------------------------------------------------------------- + +// Defines 2 unmasked instructions where the xo field for acc/non-acc version +// is even/odd. +multiclass ACC_UM_XOEO opcode, bits<8> xo, dag IOL, string asmbase, + string asmstr> { + let Predicates = [MMA] in { + def NAME : + XX3Form_AT3_XAB6, + RegConstraint<"@earlyclobber $AT">; + def PP : + XX3Form_AT3_XAB6, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 4 instructions, masked/unmasked with masks 8, 4, 4 bits. +// The XO field for acc/non-acc version is even/odd. 
+multiclass ACC_UM_M844_XOEO opcode, bits<8> xo, dag IOL, string asmbase, + string asmstr> { + defm NAME : ACC_UM_XOEO; + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME : + MMIRR_XX3Form_XY4P8_XAB6< + opcode, !or(xo, 0x01), (outs acc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP : + MMIRR_XX3Form_XY4P8_XAB6< + opcode, xo, (outs acc:$AT), + !con((ins acc:$ATi), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 4 instructions, masked/unmasked with masks 4, 4, 4 bits. +// The XO field for acc/non-acc version is even/odd. +multiclass ACC_UM_M444_XOEO opcode, bits<8> xo, dag IOL, string asmbase, + string asmstr> { + defm NAME : ACC_UM_XOEO; + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME : + MMIRR_XX3Form_XYP4_XAB6< + opcode, !or(xo, 0x01), (outs acc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP : + MMIRR_XX3Form_XYP4_XAB6< + opcode, xo, (outs acc:$AT), + !con((ins acc:$ATi), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 4 instructions, masked/unmasked with masks 2, 4, 4 bits. +// The XO field for acc/non-acc version is even/odd. 
+multiclass ACC_UM_M244_XOEO opcode, bits<8> xo, dag IOL, string asmbase, + string asmstr> { + defm NAME : ACC_UM_XOEO; + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x01), (outs acc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, xo, (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 4 instructions, masked/unmasked with masks 2, 4, 4 bits. +// Upper nibble of XO field for acc/non-acc version is 0x4/0x6. +multiclass ACC_UM_M244_XO46 opcode, bits<8> xo, dag IOL, string asmbase, + string asmstr> { + let Predicates = [MMA] in { + def NAME : + XX3Form_AT3_XAB6, + RegConstraint<"@earlyclobber $AT">; + def PP : + XX3Form_AT3_XAB6< + opcode, !or(xo, 0x20), (outs acc:$AT), !con((ins acc:$ATi), IOL), + !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, xo, (outs acc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x20), (outs acc:$AT), + !con((ins acc:$ATi), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 10 instructions, operand negating, unmasked, masked with 2, 4, 4 +// bits. 
Upper nibble are masked with 0x8, 0x4, 0xC for negating operands. +multiclass ACC_NEG_UM_M244_XOM84C opcode, bits<8> xo, dag IOL, + string asmbase, string asmstr> { + defm NAME : ACC_UM_M244_XOEO; + let Predicates = [MMA] in { + def PN : XX3Form_AT3_XAB6< + opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins acc:$ATi), IOL), + !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def NP : XX3Form_AT3_XAB6< + opcode, !or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), IOL), + !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def NN : XX3Form_AT3_XAB6< + opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), IOL), + !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME#PN : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x80), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#NP : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x40), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#NN : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0xC0), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 5 instructions, unmasked, operand negating. +// Upper nibble are masked with 0x8, 0x4, 0xC for negating operands. 
+multiclass ACC_NEG_UM_XOM84C opcode, bits<8> xo, dag IOL, + string asmbase, string asmstr> { + defm NAME : ACC_UM_XOEO; + let Predicates = [MMA] in { + def PN : XX3Form_AT3_XAB6, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def NP : XX3Form_AT3_XAB6, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def NN : XX3Form_AT3_XAB6, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 10 instructions, operand negating, unmasked, masked with 4, 4 bits. +// Upper nibble are masked with 0x8, 0x4, 0xC for negating operands. +multiclass ACC_NEG_UM_M44_XOM84C opcode, bits<8> xo, dag IOL, + string asmbase, string asmstr> { + defm NAME : ACC_NEG_UM_XOM84C; + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME : + MMIRR_XX3Form_XY4_XAB6< + opcode, !or(xo, 0x01), (outs acc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP : + MMIRR_XX3Form_XY4_XAB6< + opcode, xo, (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#PN : + MMIRR_XX3Form_XY4_XAB6< + opcode, !or(xo, 0x80), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), + !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#NP : + MMIRR_XX3Form_XY4_XAB6< + opcode, !or(xo, 0x40), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), + !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#NN : + MMIRR_XX3Form_XY4_XAB6< + opcode, !or(xo, 0xC0), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), + !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"), + 
IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 10 instructions, operand negating, unmasked, masked with 4, 2 bits. +// Upper nibble are masked with 0x8, 0x4, 0xC for negating operands. +multiclass ACC_NEG_UM_M42_XOM84C opcode, bits<8> xo, dag IOL, + string asmbase, string asmstr> { + defm NAME : ACC_NEG_UM_XOM84C; + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME : + MMIRR_XX3Form_X4Y2_XAB6< + opcode, !or(xo, 0x01), (outs acc:$AT), + !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP : + MMIRR_XX3Form_X4Y2_XAB6< + opcode, xo, (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#PN : + MMIRR_XX3Form_X4Y2_XAB6< + opcode, !or(xo, 0x80), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), + !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#NP : + MMIRR_XX3Form_X4Y2_XAB6< + opcode, !or(xo, 0x40), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), + !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#NN : + MMIRR_XX3Form_X4Y2_XAB6< + opcode, !or(xo, 0xC0), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), + !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// End of class definitions. 
+//----------------------------------------------------------------------------- + let Predicates = [MMA] in { def XXMFACC : XForm_AT3<31, 0, 177, (outs acc:$ASo), (ins acc:$AS), "xxmfacc $AS", @@ -824,8 +1283,62 @@ XForm_AT3<31, 3, 177, (outs acc:$AT), (ins), "xxsetaccz $AT", IIC_VecGeneral, []>; } + def XVI8GER4SPP : + XX3Form_AT3_XAB6<59, 99, (outs acc:$AT), (ins acc:$ATi, vsrc:$XA, vsrc:$XB), + "xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; +} + +let Predicates = [MMA, PrefixInstrs] in { + def PMXVI8GER4SPP : + MMIRR_XX3Form_XYP4_XAB6<59, 99, (outs acc:$AT), + (ins acc:$ATi, vsrc:$XA,vsrc:$XB, u4imm:$XMSK, + u4imm:$YMSK, u4imm:$PMSK), + "pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK", + IIC_VecGeneral, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; } +// MMA accumulating/non-accumulating instructions. +//------------------------------------------------------------------------------ + +// XVBF16GER2, XVBF16GER2PP, XVBF16GER2PN, XVBF16GER2NP, XVBF16GER2NN +// PMXVBF16GER2, PMXVBF16GER2PP, PMXVBF16GER2PN, PMXVBF16GER2NP, PMXVBF16GER2NN +defm XVBF16GER2 : ACC_NEG_UM_M244_XOM84C<59, 50, (ins vsrc:$XA, vsrc:$XB), + "xvbf16ger2", "$AT, $XA, $XB">; + +// XVI4GER8, XVI4GER8PP, PMXVI4GER8, PMXVI4GER8PP +defm XVI4GER8 : ACC_UM_M844_XOEO<59, 34, (ins vsrc:$XA, vsrc:$XB), + "xvi4ger8", "$AT, $XA, $XB">; + +// XVI8GER4, XVI8GER4PP, PMXVI8GER4, PMXVI8GER4PP +defm XVI8GER4 : ACC_UM_M444_XOEO<59, 2, (ins vsrc:$XA, vsrc:$XB), + "xvi8ger4", "$AT, $XA, $XB">; + +// XVI16GER2, XVI16GER2PP, PMXVI16GER2, PMXVI16GER2PP +defm XVI16GER2 : ACC_UM_M244_XO46<59, 75, (ins vsrc:$XA, vsrc:$XB), + "xvi16ger2", "$AT, $XA, $XB">; + +// XVI16GER2S, XVI16GER2SPP, PMXVI16GER2S, PMXVI16GER2SPP +defm XVI16GER2S : ACC_UM_M244_XOEO<59, 42, (ins vsrc:$XA, vsrc:$XB), + "xvi16ger2s", "$AT, $XA, $XB">; + +// XVF16GER2, XVF16GER2PP, XVF16GER2PN, XVF16GER2NP, XVF16GER2NN +// PMXVF16GER2, PMXVF16GER2PP, PMXVF16GER2PN, PMXVF16GER2NP, PMXVF16GER2NN 
+defm XVF16GER2 : ACC_NEG_UM_M244_XOM84C<59, 18, (ins vsrc:$XA, vsrc:$XB), + "xvf16ger2", "$AT, $XA, $XB">; + +// XVF32GER, XVF32GERPP, XVF32GERPN, XVF32GERNP, XVF32GERNN +// PMXVF32GER, PMXVF32GERPP, PMXVF32GERPN, PMXVF32GERNP, PMXVF32GERNN +defm XVF32GER : ACC_NEG_UM_M44_XOM84C<59, 26, (ins vsrc:$XA, vsrc:$XB), + "xvf32ger", "$AT, $XA, $XB">; + +// XVF64GER, XVF64GERPP, XVF64GERPN, XVF64GERNP, XVF64GERNN +// PMXVF64GER, PMXVF64GERPP, PMXVF64GERPN, PMXVF64GERNP, PMXVF64GERNN +defm XVF64GER : ACC_NEG_UM_M42_XOM84C<59, 58, (ins vsrpevenrc:$XA, vsrc:$XB), + "xvf64ger", "$AT, $XA, $XB">; +//------------------------------------------------------------------------------ + def Concats { dag VecsToVecPair0 = (v256i1 (INSERT_SUBREG diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -40939,8 +40939,7 @@ // (x > 0) ? x : 0 -> (x >= 0) ? x : 0 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates - // the need for an extra compare - // against zero. e.g. + // the need for an extra compare against zero. e.g. // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0 // subl %esi, %edi // testl %edi, %edi @@ -40950,17 +40949,28 @@ // xorl %eax, %eax // subl %esi, $edi // cmovsl %eax, %edi + // + // We can also canonicalize + // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1 + // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1 + // This allows the use of a test instruction for the compare. 
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse() && LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); - if ((CC == ISD::SETGT && isNullConstant(RHS)) || + if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) || (CC == ISD::SETLT && isAllOnesConstant(RHS))) { ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE; Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); return DAG.getSelect(DL, VT, Cond, LHS, RHS); } + if (CC == ISD::SETUGT && isOneConstant(RHS)) { + ISD::CondCode NewCC = ISD::SETUGE; + Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), + Cond.getOperand(0), Cond.getOperand(1), NewCC); + return DAG.getSelect(DL, VT, Cond, LHS, RHS); + } } // Match VSELECTs into subs with unsigned saturation. diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -92,6 +92,10 @@ cl::init(false), cl::Hidden, cl::desc("Enable Unroll And Jam Pass")); +static cl::opt EnableLoopFlatten("enable-loop-flatten", cl::init(false), + cl::Hidden, + cl::desc("Enable the LoopFlatten Pass")); + static cl::opt EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden, cl::desc("Enable preparation for ThinLTO.")); @@ -444,6 +448,10 @@ if (EnableLoopInterchange) MPM.add(createLoopInterchangePass()); // Interchange loops + if (EnableLoopFlatten) { + MPM.add(createLoopFlattenPass()); // Flatten loops + MPM.add(createLoopSimplifyCFGPass()); + } // Unroll small loops MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, @@ -1035,6 +1043,8 @@ PM.add(createLoopDeletionPass()); if (EnableLoopInterchange) PM.add(createLoopInterchangePass()); + if (EnableLoopFlatten) + PM.add(createLoopFlattenPass()); // Unroll small loops 
PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -753,6 +753,11 @@ auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & { return FAM.getResult(F); }; + if (UseCommandLine) { + if (DevirtModule::runForTesting(M, AARGetter, OREGetter, LookupDomTree)) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); + } if (!DevirtModule(M, AARGetter, OREGetter, LookupDomTree, ExportSummary, ImportSummary) .run()) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -6089,7 +6089,7 @@ /// Optimize fabs(X) compared with zero. static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombinerImpl &IC) { Value *X; - if (!match(I.getOperand(0), m_Intrinsic(m_Value(X))) || + if (!match(I.getOperand(0), m_FAbs(m_Value(X))) || !match(I.getOperand(1), m_PosZeroFP())) return nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -711,6 +711,7 @@ Value *A, Value *B, Instruction &Outer, SelectPatternFlavor SPF2, Value *C); Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI); + Instruction *foldSelectValueEquivalence(SelectInst &SI, ICmpInst &ICI); Instruction *OptAndOp(BinaryOperator *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, BinaryOperator &TheAnd); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp --- 
a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -432,13 +432,12 @@ // fabs(X) * fabs(X) -> X * X // fabs(X) / fabs(X) -> X / X - if (Op0 == Op1 && match(Op0, m_Intrinsic(m_Value(X)))) + if (Op0 == Op1 && match(Op0, m_FAbs(m_Value(X)))) return BinaryOperator::CreateWithCopiedFlags(Opcode, X, X, &I); // fabs(X) * fabs(Y) --> fabs(X * Y) // fabs(X) / fabs(Y) --> fabs(X / Y) - if (match(Op0, m_Intrinsic(m_Value(X))) && - match(Op1, m_Intrinsic(m_Value(Y))) && + if (match(Op0, m_FAbs(m_Value(X))) && match(Op1, m_FAbs(m_Value(Y))) && (Op0->hasOneUse() || Op1->hasOneUse())) { IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); Builder.setFastMathFlags(I.getFastMathFlags()); @@ -1393,10 +1392,8 @@ // X / fabs(X) -> copysign(1.0, X) // fabs(X) / X -> copysign(1.0, X) if (I.hasNoNaNs() && I.hasNoInfs() && - (match(&I, - m_FDiv(m_Value(X), m_Intrinsic(m_Deferred(X)))) || - match(&I, m_FDiv(m_Intrinsic(m_Value(X)), - m_Deferred(X))))) { + (match(&I, m_FDiv(m_Value(X), m_FAbs(m_Deferred(X)))) || + match(&I, m_FDiv(m_FAbs(m_Value(X)), m_Deferred(X))))) { Value *V = Builder.CreateBinaryIntrinsic( Intrinsic::copysign, ConstantFP::get(I.getType(), 1.0), X, &I); return replaceInstUsesWith(I, V); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1165,9 +1165,8 @@ /// /// We can't replace %sel with %add unless we strip away the flags. /// TODO: Wrapping flags could be preserved in some cases with better analysis. 
-static Instruction *foldSelectValueEquivalence(SelectInst &Sel, ICmpInst &Cmp, - const SimplifyQuery &Q, - InstCombiner &IC) { +Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, + ICmpInst &Cmp) { if (!Cmp.isEquality()) return nullptr; @@ -1179,18 +1178,20 @@ Swapped = true; } - // In X == Y ? f(X) : Z, try to evaluate f(X) and replace the operand. - // Take care to avoid replacing X == Y ? X : Z with X == Y ? Y : Z, as that - // would lead to an infinite replacement cycle. + // In X == Y ? f(X) : Z, try to evaluate f(Y) and replace the operand. + // Make sure Y cannot be undef though, as we might pick different values for + // undef in the icmp and in f(Y). Additionally, take care to avoid replacing + // X == Y ? X : Z with X == Y ? Y : Z, as that would lead to an infinite + // replacement cycle. Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1); - if (TrueVal != CmpLHS) - if (Value *V = SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, + if (TrueVal != CmpLHS && isGuaranteedNotToBeUndefOrPoison(CmpRHS, &Sel, &DT)) + if (Value *V = SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, SQ, /* AllowRefinement */ true)) - return IC.replaceOperand(Sel, Swapped ? 2 : 1, V); - if (TrueVal != CmpRHS) - if (Value *V = SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, + return replaceOperand(Sel, Swapped ? 2 : 1, V); + if (TrueVal != CmpRHS && isGuaranteedNotToBeUndefOrPoison(CmpLHS, &Sel, &DT)) + if (Value *V = SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, SQ, /* AllowRefinement */ true)) - return IC.replaceOperand(Sel, Swapped ? 2 : 1, V); + return replaceOperand(Sel, Swapped ? 2 : 1, V); auto *FalseInst = dyn_cast(FalseVal); if (!FalseInst) @@ -1215,11 +1216,11 @@ // We have an 'EQ' comparison, so the select's false value will propagate. // Example: // (X == 42) ? 43 : (X + 1) --> (X == 42) ? 
(X + 1) : (X + 1) --> X + 1 - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, SQ, /* AllowRefinement */ false) == TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, SQ, /* AllowRefinement */ false) == TrueVal) { - return IC.replaceInstUsesWith(Sel, FalseVal); + return replaceInstUsesWith(Sel, FalseVal); } // Restore poison-generating flags if the transform did not apply. @@ -1455,7 +1456,7 @@ /// Visit a SelectInst that has an ICmpInst as its first operand. Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { - if (Instruction *NewSel = foldSelectValueEquivalence(SI, *ICI, SQ, *this)) + if (Instruction *NewSel = foldSelectValueEquivalence(SI, *ICI)) return NewSel; if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, *this)) diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -32,6 +32,7 @@ LoopIdiomRecognize.cpp LoopInstSimplify.cpp LoopInterchange.cpp + LoopFlatten.cpp LoopLoadElimination.cpp LoopPassManager.cpp LoopPredication.cpp diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -2404,10 +2404,44 @@ if (auto *LoadI = dyn_cast(Store->getOperand(0))) { if (LoadI->getPointerOperand() == Store->getOperand(1)) { + // Get the defining access for the load. auto *LoadAccess = MSSA.getMemoryAccess(LoadI)->getDefiningAccess(); - // If both accesses share the same defining access, no instructions - // between them can modify the memory location. - return LoadAccess == Def->getDefiningAccess(); + // Fast path: the defining accesses are the same. 
+ if (LoadAccess == Def->getDefiningAccess()) + return true; + + // Look through phi accesses. Recursively scan all phi accesses by + // adding them to a worklist. Bail when we run into a memory def that + // does not match LoadAccess. + SetVector ToCheck; + MemoryAccess *Current = Def->getDefiningAccess(); + // We don't want to bail when we run into the store memory def. But, + // the phi access may point to it. So, pretend like we've already + // checked it. + ToCheck.insert(Def); + ToCheck.insert(Current); + // Start at current (1) to simulate already having checked Def. + for (unsigned I = 1; I < ToCheck.size(); ++I) { + Current = ToCheck[I]; + if (auto PhiAccess = dyn_cast(Current)) { + // Check all the operands. + for (auto &Use : PhiAccess->incoming_values()) + ToCheck.insert(cast(&Use)); + continue; + } + + // If we found a memory def, bail. This happens when we have an + // unrelated write in between an otherwise noop store. + assert(isa(Current) && + "Only MemoryDefs should reach here."); + // TODO: Skip no alias MemoryDefs that have no aliasing reads. + // We are searching for the definition of the store's destination. + // So, if that is the same definition as the load, then this is a + // noop. Otherwise, fail. + if (LoadAccess != Current) + return false; + } + return true; } } diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -0,0 +1,605 @@ +//===- LoopFlatten.cpp - Loop flattening pass------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass flattens pairs of nested loops into a single loop. 
+// +// The intention is to optimise loop nests like this, which together access an +// array linearly: +// for (int i = 0; i < N; ++i) +// for (int j = 0; j < M; ++j) +// f(A[i*M+j]); +// into one loop: +// for (int i = 0; i < (N*M); ++i) +// f(A[i]); +// +// It can also flatten loops where the induction variables are not used in the +// loop. This is only worth doing if the induction variables are only used in an +// expression like i*M+j. If they had any other uses, we would have to insert a +// div/mod to reconstruct the original values, so this wouldn't be profitable. +// +// We also need to prove that N*M will not overflow. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopFlatten.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +#define DEBUG_TYPE "loop-flatten" + +using namespace llvm; +using namespace llvm::PatternMatch; + +static cl::opt RepeatedInstructionThreshold( + "loop-flatten-cost-threshold", cl::Hidden, cl::init(2), + cl::desc("Limit on the cost of instructions that can be repeated due to " + "loop flattening")); + +static cl::opt + AssumeNoOverflow("loop-flatten-assume-no-overflow", cl::Hidden, + cl::init(false), + cl::desc("Assume that the product of the two iteration " + "limits will never overflow")); + +// Finds the induction variable, 
increment and limit for a simple loop that we +// can flatten. +static bool findLoopComponents( + Loop *L, SmallPtrSetImpl &IterationInstructions, + PHINode *&InductionPHI, Value *&Limit, BinaryOperator *&Increment, + BranchInst *&BackBranch, ScalarEvolution *SE) { + LLVM_DEBUG(dbgs() << "Finding components of loop: " << L->getName() << "\n"); + + if (!L->isLoopSimplifyForm()) { + LLVM_DEBUG(dbgs() << "Loop is not in normal form\n"); + return false; + } + + // There must be exactly one exiting block, and it must be the same as the + // latch. + BasicBlock *Latch = L->getLoopLatch(); + if (L->getExitingBlock() != Latch) { + LLVM_DEBUG(dbgs() << "Exiting and latch block are different\n"); + return false; + } + // Latch block must end in a conditional branch. + BackBranch = dyn_cast(Latch->getTerminator()); + if (!BackBranch || !BackBranch->isConditional()) { + LLVM_DEBUG(dbgs() << "Could not find back-branch\n"); + return false; + } + IterationInstructions.insert(BackBranch); + LLVM_DEBUG(dbgs() << "Found back branch: "; BackBranch->dump()); + bool ContinueOnTrue = L->contains(BackBranch->getSuccessor(0)); + + // Find the induction PHI. If there is no induction PHI, we can't do the + // transformation. TODO: could other variables trigger this? Do we have to + // search for the best one? 
+ InductionPHI = nullptr; + for (PHINode &PHI : L->getHeader()->phis()) { + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) { + InductionPHI = &PHI; + LLVM_DEBUG(dbgs() << "Found induction PHI: "; InductionPHI->dump()); + break; + } + } + if (!InductionPHI) { + LLVM_DEBUG(dbgs() << "Could not find induction PHI\n"); + return false; + } + + auto IsValidPredicate = [&](ICmpInst::Predicate Pred) { + if (ContinueOnTrue) + return Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT; + else + return Pred == CmpInst::ICMP_EQ; + }; + + // Find Compare and make sure it is valid + ICmpInst *Compare = dyn_cast(BackBranch->getCondition()); + if (!Compare || !IsValidPredicate(Compare->getUnsignedPredicate()) || + Compare->hasNUsesOrMore(2)) { + LLVM_DEBUG(dbgs() << "Could not find valid comparison\n"); + return false; + } + IterationInstructions.insert(Compare); + LLVM_DEBUG(dbgs() << "Found comparison: "; Compare->dump()); + + // Find increment and limit from the compare + Increment = nullptr; + if (match(Compare->getOperand(0), + m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) { + Increment = dyn_cast(Compare->getOperand(0)); + Limit = Compare->getOperand(1); + } else if (Compare->getUnsignedPredicate() == CmpInst::ICMP_NE && + match(Compare->getOperand(1), + m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) { + Increment = dyn_cast(Compare->getOperand(1)); + Limit = Compare->getOperand(0); + } + if (!Increment || Increment->hasNUsesOrMore(3)) { + LLVM_DEBUG(dbgs() << "Cound not find valid increment\n"); + return false; + } + IterationInstructions.insert(Increment); + LLVM_DEBUG(dbgs() << "Found increment: "; Increment->dump()); + LLVM_DEBUG(dbgs() << "Found limit: "; Limit->dump()); + + assert(InductionPHI->getNumIncomingValues() == 2); + assert(InductionPHI->getIncomingValueForBlock(Latch) == Increment && + "PHI value is not increment inst"); + + auto *CI = dyn_cast( + 
InductionPHI->getIncomingValueForBlock(L->getLoopPreheader())); + if (!CI || !CI->isZero()) { + LLVM_DEBUG(dbgs() << "PHI value is not zero: "; CI->dump()); + return false; + } + + LLVM_DEBUG(dbgs() << "Successfully found all loop components\n"); + return true; +} + +static bool checkPHIs(Loop *OuterLoop, Loop *InnerLoop, + SmallPtrSetImpl &InnerPHIsToTransform, + PHINode *InnerInductionPHI, PHINode *OuterInductionPHI, + TargetTransformInfo *TTI) { + // All PHIs in the inner and outer headers must either be: + // - The induction PHI, which we are going to rewrite as one induction in + // the new loop. This is already checked by findLoopComponents. + // - An outer header PHI with all incoming values from outside the loop. + // LoopSimplify guarantees we have a pre-header, so we don't need to + // worry about that here. + // - Pairs of PHIs in the inner and outer headers, which implement a + // loop-carried dependency that will still be valid in the new loop. To + // be valid, this variable must be modified only in the inner loop. + + // The set of PHI nodes in the outer loop header that we know will still be + // valid after the transformation. These will not need to be modified (with + // the exception of the induction variable), but we do need to check that + // there are no unsafe PHI nodes. + SmallPtrSet SafeOuterPHIs; + SafeOuterPHIs.insert(OuterInductionPHI); + + // Check that all PHI nodes in the inner loop header match one of the valid + // patterns. + for (PHINode &InnerPHI : InnerLoop->getHeader()->phis()) { + // The induction PHIs break these rules, and that's OK because we treat + // them specially when doing the transformation. + if (&InnerPHI == InnerInductionPHI) + continue; + + // Each inner loop PHI node must have two incoming values/blocks - one + // from the pre-header, and one from the latch. 
+ assert(InnerPHI.getNumIncomingValues() == 2); + Value *PreHeaderValue = + InnerPHI.getIncomingValueForBlock(InnerLoop->getLoopPreheader()); + Value *LatchValue = + InnerPHI.getIncomingValueForBlock(InnerLoop->getLoopLatch()); + + // The incoming value from the outer loop must be the PHI node in the + // outer loop header, with no modifications made in the top of the outer + // loop. + PHINode *OuterPHI = dyn_cast(PreHeaderValue); + if (!OuterPHI || OuterPHI->getParent() != OuterLoop->getHeader()) { + LLVM_DEBUG(dbgs() << "value modified in top of outer loop\n"); + return false; + } + + // The other incoming value must come from the inner loop, without any + // modifications in the tail end of the outer loop. We are in LCSSA form, + // so this will actually be a PHI in the inner loop's exit block, which + // only uses values from inside the inner loop. + PHINode *LCSSAPHI = dyn_cast( + OuterPHI->getIncomingValueForBlock(OuterLoop->getLoopLatch())); + if (!LCSSAPHI) { + LLVM_DEBUG(dbgs() << "could not find LCSSA PHI\n"); + return false; + } + + // The value used by the LCSSA PHI must be the same one that the inner + // loop's PHI uses. 
+ if (LCSSAPHI->hasConstantValue() != LatchValue) { + LLVM_DEBUG( + dbgs() << "LCSSA PHI incoming value does not match latch value\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "PHI pair is safe:\n"); + LLVM_DEBUG(dbgs() << " Inner: "; InnerPHI.dump()); + LLVM_DEBUG(dbgs() << " Outer: "; OuterPHI->dump()); + SafeOuterPHIs.insert(OuterPHI); + InnerPHIsToTransform.insert(&InnerPHI); + } + + for (PHINode &OuterPHI : OuterLoop->getHeader()->phis()) { + if (!SafeOuterPHIs.count(&OuterPHI)) { + LLVM_DEBUG(dbgs() << "found unsafe PHI in outer loop: "; OuterPHI.dump()); + return false; + } + } + + return true; +} + +static bool +checkOuterLoopInsts(Loop *OuterLoop, Loop *InnerLoop, + SmallPtrSetImpl &IterationInstructions, + Value *InnerLimit, PHINode *OuterPHI, + TargetTransformInfo *TTI) { + // Check for instructions in the outer but not inner loop. If any of these + // have side-effects then this transformation is not legal, and if there is + // a significant amount of code here which can't be optimised out then it's + // not profitable (as these instructions would get executed for each + // iteration of the inner loop). + unsigned RepeatedInstrCost = 0; + for (auto *B : OuterLoop->getBlocks()) { + if (InnerLoop->contains(B)) + continue; + + for (auto &I : *B) { + if (!isa(&I) && !I.isTerminator() && + !isSafeToSpeculativelyExecute(&I)) { + LLVM_DEBUG(dbgs() << "Cannot flatten because instruction may have " + "side effects: "; + I.dump()); + return false; + } + // The execution count of the outer loop's iteration instructions + // (increment, compare and branch) will be increased, but the + // equivalent instructions will be removed from the inner loop, so + // they make a net difference of zero. + if (IterationInstructions.count(&I)) + continue; + // The unconditional branch to the inner loop's header will turn into + // a fall-through, so adds no cost. 
+ BranchInst *Br = dyn_cast(&I); + if (Br && Br->isUnconditional() && + Br->getSuccessor(0) == InnerLoop->getHeader()) + continue; + // Multiplies of the outer iteration variable and inner iteration + // count will be optimised out. + if (match(&I, m_c_Mul(m_Specific(OuterPHI), m_Specific(InnerLimit)))) + continue; + int Cost = TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency); + LLVM_DEBUG(dbgs() << "Cost " << Cost << ": "; I.dump()); + RepeatedInstrCost += Cost; + } + } + + LLVM_DEBUG(dbgs() << "Cost of instructions that will be repeated: " + << RepeatedInstrCost << "\n"); + // Bail out if flattening the loops would cause instructions in the outer + // loop but not in the inner loop to be executed extra times. + if (RepeatedInstrCost > RepeatedInstructionThreshold) + return false; + + return true; +} + +static bool checkIVUsers(PHINode *InnerPHI, PHINode *OuterPHI, + BinaryOperator *InnerIncrement, + BinaryOperator *OuterIncrement, Value *InnerLimit, + SmallPtrSetImpl &LinearIVUses) { + // We require all uses of both induction variables to match this pattern: + // + // (OuterPHI * InnerLimit) + InnerPHI + // + // Any uses of the induction variables not matching that pattern would + // require a div/mod to reconstruct in the flattened loop, so the + // transformation wouldn't be profitable. + + // Check that all uses of the inner loop's induction variable match the + // expected pattern, recording the uses of the outer IV. 
+ SmallPtrSet ValidOuterPHIUses; + for (User *U : InnerPHI->users()) { + if (U == InnerIncrement) + continue; + + LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump()); + + Value *MatchedMul, *MatchedItCount; + if (match(U, m_c_Add(m_Specific(InnerPHI), m_Value(MatchedMul))) && + match(MatchedMul, + m_c_Mul(m_Specific(OuterPHI), m_Value(MatchedItCount))) && + MatchedItCount == InnerLimit) { + LLVM_DEBUG(dbgs() << "Use is optimisable\n"); + ValidOuterPHIUses.insert(MatchedMul); + LinearIVUses.insert(U); + } else { + LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n"); + return false; + } + } + + // Check that there are no uses of the outer IV other than the ones found + // as part of the pattern above. + for (User *U : OuterPHI->users()) { + if (U == OuterIncrement) + continue; + + LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump()); + + if (!ValidOuterPHIUses.count(U)) { + LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n"); + return false; + } else { + LLVM_DEBUG(dbgs() << "Use is optimisable\n"); + } + } + + LLVM_DEBUG(dbgs() << "Found " << LinearIVUses.size() + << " value(s) that can be replaced:\n"; + for (Value *V : LinearIVUses) { + dbgs() << " "; + V->dump(); + }); + + return true; +} + +// Return an OverflowResult dependent on whether overflow of the multiplication +// of InnerLimit and OuterLimit can be assumed not to happen. +static OverflowResult checkOverflow(Loop *OuterLoop, Value *InnerLimit, + Value *OuterLimit, + SmallPtrSetImpl &LinearIVUses, + DominatorTree *DT, AssumptionCache *AC) { + Function *F = OuterLoop->getHeader()->getParent(); + const DataLayout &DL = F->getParent()->getDataLayout(); + + // For debugging/testing. + if (AssumeNoOverflow) + return OverflowResult::NeverOverflows; + + // Check if the multiply could not overflow due to known ranges of the + // input values. 
+ OverflowResult OR = computeOverflowForUnsignedMul( + InnerLimit, OuterLimit, DL, AC, + OuterLoop->getLoopPreheader()->getTerminator(), DT); + if (OR != OverflowResult::MayOverflow) + return OR; + + for (Value *V : LinearIVUses) { + for (Value *U : V->users()) { + if (auto *GEP = dyn_cast(U)) { + // The IV is used as the operand of a GEP, and the IV is at least as + // wide as the address space of the GEP. In this case, the GEP would + // wrap around the address space before the IV increment wraps, which + // would be UB. + if (GEP->isInBounds() && + V->getType()->getIntegerBitWidth() >= + DL.getPointerTypeSizeInBits(GEP->getType())) { + LLVM_DEBUG( + dbgs() << "use of linear IV would be UB if overflow occurred: "; + GEP->dump()); + return OverflowResult::NeverOverflows; + } + } + } + } + + return OverflowResult::MayOverflow; +} + +static bool FlattenLoopPair(Loop *OuterLoop, Loop *InnerLoop, DominatorTree *DT, + LoopInfo *LI, ScalarEvolution *SE, + AssumptionCache *AC, TargetTransformInfo *TTI, + std::function markLoopAsDeleted) { + Function *F = OuterLoop->getHeader()->getParent(); + + LLVM_DEBUG(dbgs() << "Loop flattening running on outer loop " + << OuterLoop->getHeader()->getName() << " and inner loop " + << InnerLoop->getHeader()->getName() << " in " + << F->getName() << "\n"); + + SmallPtrSet IterationInstructions; + + PHINode *InnerInductionPHI, *OuterInductionPHI; + Value *InnerLimit, *OuterLimit; + BinaryOperator *InnerIncrement, *OuterIncrement; + BranchInst *InnerBranch, *OuterBranch; + + if (!findLoopComponents(InnerLoop, IterationInstructions, InnerInductionPHI, + InnerLimit, InnerIncrement, InnerBranch, SE)) + return false; + if (!findLoopComponents(OuterLoop, IterationInstructions, OuterInductionPHI, + OuterLimit, OuterIncrement, OuterBranch, SE)) + return false; + + // Both of the loop limit values must be invariant in the outer loop + // (non-instructions are all inherently invariant). 
+ if (!OuterLoop->isLoopInvariant(InnerLimit)) { + LLVM_DEBUG(dbgs() << "inner loop limit not invariant\n"); + return false; + } + if (!OuterLoop->isLoopInvariant(OuterLimit)) { + LLVM_DEBUG(dbgs() << "outer loop limit not invariant\n"); + return false; + } + + SmallPtrSet InnerPHIsToTransform; + if (!checkPHIs(OuterLoop, InnerLoop, InnerPHIsToTransform, InnerInductionPHI, + OuterInductionPHI, TTI)) + return false; + + // FIXME: it should be possible to handle different types correctly. + if (InnerInductionPHI->getType() != OuterInductionPHI->getType()) + return false; + + if (!checkOuterLoopInsts(OuterLoop, InnerLoop, IterationInstructions, + InnerLimit, OuterInductionPHI, TTI)) + return false; + + // Find the values in the loop that can be replaced with the linearized + // induction variable, and check that there are no other uses of the inner + // or outer induction variable. If there were, we could still do this + // transformation, but we'd have to insert a div/mod to calculate the + // original IVs, so it wouldn't be profitable. + SmallPtrSet LinearIVUses; + if (!checkIVUsers(InnerInductionPHI, OuterInductionPHI, InnerIncrement, + OuterIncrement, InnerLimit, LinearIVUses)) + return false; + + // Check if the new iteration variable might overflow. In this case, we + // need to version the loop, and select the original version at runtime if + // the iteration space is too large. + // TODO: We currently don't version the loop. + // TODO: it might be worth using a wider iteration variable rather than + // versioning the loop, if a wide enough type is legal. 
+ bool MustVersionLoop = true; + OverflowResult OR = + checkOverflow(OuterLoop, InnerLimit, OuterLimit, LinearIVUses, DT, AC); + if (OR == OverflowResult::AlwaysOverflowsHigh || + OR == OverflowResult::AlwaysOverflowsLow) { + LLVM_DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n"); + return false; + } else if (OR == OverflowResult::MayOverflow) { + LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n"); + } else { + LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n"); + MustVersionLoop = false; + } + + // We cannot safely flatten the loop. Exit now. + if (MustVersionLoop) + return false; + + // Do the actual transformation. + LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n"); + + { + using namespace ore; + OptimizationRemark Remark(DEBUG_TYPE, "Flattened", InnerLoop->getStartLoc(), + InnerLoop->getHeader()); + OptimizationRemarkEmitter ORE(F); + Remark << "Flattened into outer loop"; + ORE.emit(Remark); + } + + Value *NewTripCount = + BinaryOperator::CreateMul(InnerLimit, OuterLimit, "flatten.tripcount", + OuterLoop->getLoopPreheader()->getTerminator()); + LLVM_DEBUG(dbgs() << "Created new trip count in preheader: "; + NewTripCount->dump()); + + // Fix up PHI nodes that take values from the inner loop back-edge, which + // we are about to remove. + InnerInductionPHI->removeIncomingValue(InnerLoop->getLoopLatch()); + for (PHINode *PHI : InnerPHIsToTransform) + PHI->removeIncomingValue(InnerLoop->getLoopLatch()); + + // Modify the trip count of the outer loop to be the product of the two + // trip counts. + cast(OuterBranch->getCondition())->setOperand(1, NewTripCount); + + // Replace the inner loop backedge with an unconditional branch to the exit. 
+ BasicBlock *InnerExitBlock = InnerLoop->getExitBlock(); + BasicBlock *InnerExitingBlock = InnerLoop->getExitingBlock(); + InnerExitingBlock->getTerminator()->eraseFromParent(); + BranchInst::Create(InnerExitBlock, InnerExitingBlock); + DT->deleteEdge(InnerExitingBlock, InnerLoop->getHeader()); + + // Replace all uses of the polynomial calculated from the two induction + // variables with the one new one. + for (Value *V : LinearIVUses) + V->replaceAllUsesWith(OuterInductionPHI); + + // Tell LoopInfo, SCEV and the pass manager that the inner loop has been + // deleted, and any information that we have about the outer loop invalidated. + markLoopAsDeleted(InnerLoop); + SE->forgetLoop(OuterLoop); + SE->forgetLoop(InnerLoop); + LI->erase(InnerLoop); + + return true; +} + +PreservedAnalyses LoopFlattenPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &Updater) { + if (L.getSubLoops().size() != 1) + return PreservedAnalyses::all(); + + Loop *InnerLoop = *L.begin(); + std::string LoopName(InnerLoop->getName()); + if (!FlattenLoopPair( + &L, InnerLoop, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, + [&](Loop *L) { Updater.markLoopAsDeleted(*L, LoopName); })) + return PreservedAnalyses::all(); + return getLoopPassPreservedAnalyses(); +} + +namespace { +class LoopFlattenLegacyPass : public LoopPass { +public: + static char ID; // Pass ID, replacement for typeid + LoopFlattenLegacyPass() : LoopPass(ID) { + initializeLoopFlattenLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + // Possibly flatten loop L into its child. 
+ bool runOnLoop(Loop *L, LPPassManager &) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + getLoopAnalysisUsage(AU); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + } +}; +} // namespace + +char LoopFlattenLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops", + false, false) + +Pass *llvm::createLoopFlattenPass() { return new LoopFlattenLegacyPass(); } + +bool LoopFlattenLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipLoop(L)) + return false; + + if (L->getSubLoops().size() != 1) + return false; + + ScalarEvolution *SE = &getAnalysis().getSE(); + LoopInfo *LI = &getAnalysis().getLoopInfo(); + auto *DTWP = getAnalysisIfAvailable(); + DominatorTree *DT = DTWP ? 
&DTWP->getDomTree() : nullptr; + auto &TTIP = getAnalysis(); + TargetTransformInfo *TTI = &TTIP.getTTI(*L->getHeader()->getParent()); + AssumptionCache *AC = + &getAnalysis().getAssumptionCache( + *L->getHeader()->getParent()); + + Loop *InnerLoop = *L->begin(); + return FlattenLoopPair(L, InnerLoop, DT, LI, SE, AC, TTI, + [&](Loop *L) { LPM.markLoopAsDeleted(*L); }); +} diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -67,6 +67,7 @@ initializeLoopAccessLegacyAnalysisPass(Registry); initializeLoopInstSimplifyLegacyPassPass(Registry); initializeLoopInterchangePass(Registry); + initializeLoopFlattenLegacyPassPass(Registry); initializeLoopPredicationLegacyPassPass(Registry); initializeLoopRotateLegacyPassPass(Registry); initializeLoopStrengthReducePass(Registry); @@ -186,6 +187,10 @@ unwrap(PM)->add(createLoopDeletionPass()); } +void LLVMAddLoopFlattenPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopFlattenPass()); +} + void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopIdiomPass()); } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -2832,7 +2832,7 @@ return I->second; auto &Result = BPS[V] = None; - auto BitWidth = cast(V->getType())->getBitWidth(); + auto BitWidth = V->getType()->getScalarSizeInBits(); // Prevent stack overflow by limiting the recursion depth if (Depth == BitPartRecursionMaxDepth) { @@ -2840,13 +2840,16 @@ return Result; } - if (Instruction *I = dyn_cast(V)) { + if (auto *I = dyn_cast(V)) { + Value *X, *Y; + const APInt *C; + // If this is an or instruction, it may be an inner node of the bswap. 
- if (I->getOpcode() == Instruction::Or) { - const auto &A = collectBitParts(I->getOperand(0), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); - const auto &B = collectBitParts(I->getOperand(1), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); + if (match(V, m_Or(m_Value(X), m_Value(Y)))) { + const auto &A = + collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1); + const auto &B = + collectBitParts(Y, MatchBSwaps, MatchBitReversals, BPS, Depth + 1); if (!A || !B) return Result; @@ -2871,15 +2874,15 @@ } // If this is a logical shift by a constant, recurse then shift the result. - if (I->isLogicalShift() && isa(I->getOperand(1))) { - unsigned BitShift = - cast(I->getOperand(1))->getLimitedValue(~0U); + if (match(V, m_LogicalShift(m_Value(X), m_APInt(C)))) { + const APInt &BitShift = *C; + // Ensure the shift amount is defined. - if (BitShift > BitWidth) + if (BitShift.uge(BitWidth)) return Result; - const auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); + const auto &Res = + collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1); if (!Res) return Result; Result = Res; @@ -2887,11 +2890,11 @@ // Perform the "shift" on BitProvenance. auto &P = Result->Provenance; if (I->getOpcode() == Instruction::Shl) { - P.erase(std::prev(P.end(), BitShift), P.end()); - P.insert(P.begin(), BitShift, BitPart::Unset); + P.erase(std::prev(P.end(), BitShift.getZExtValue()), P.end()); + P.insert(P.begin(), BitShift.getZExtValue(), BitPart::Unset); } else { - P.erase(P.begin(), std::next(P.begin(), BitShift)); - P.insert(P.end(), BitShift, BitPart::Unset); + P.erase(P.begin(), std::next(P.begin(), BitShift.getZExtValue())); + P.insert(P.end(), BitShift.getZExtValue(), BitPart::Unset); } return Result; @@ -2899,9 +2902,8 @@ // If this is a logical 'and' with a mask that clears bits, recurse then // unset the appropriate bits. 
- if (I->getOpcode() == Instruction::And && - isa(I->getOperand(1))) { - const APInt &AndMask = cast(I->getOperand(1))->getValue(); + if (match(V, m_And(m_Value(X), m_APInt(C)))) { + const APInt &AndMask = *C; // Check that the mask allows a multiple of 8 bits for a bswap, for an // early exit. @@ -2909,8 +2911,8 @@ if (!MatchBitReversals && (NumMaskedBits % 8) != 0) return Result; - const auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); + const auto &Res = + collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1); if (!Res) return Result; Result = Res; @@ -2923,15 +2925,14 @@ } // If this is a zext instruction zero extend the result. - if (I->getOpcode() == Instruction::ZExt) { - const auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); + if (match(V, m_ZExt(m_Value(X)))) { + const auto &Res = + collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1); if (!Res) return Result; Result = BitPart(Res->Provider, BitWidth); - auto NarrowBitWidth = - cast(cast(I)->getSrcTy())->getBitWidth(); + auto NarrowBitWidth = X->getType()->getScalarSizeInBits(); for (unsigned BitIdx = 0; BitIdx < NarrowBitWidth; ++BitIdx) Result->Provenance[BitIdx] = Res->Provenance[BitIdx]; for (unsigned BitIdx = NarrowBitWidth; BitIdx < BitWidth; ++BitIdx) @@ -2939,40 +2940,33 @@ return Result; } - // Handle intrinsic calls. - if (auto *II = dyn_cast(I)) { - Intrinsic::ID IntrinsicID = II->getIntrinsicID(); - - // Funnel 'double' shifts take 3 operands, 2 inputs and the shift - // amount (modulo). - // fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW))) - // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)) - const APInt *Amt; - if ((IntrinsicID == Intrinsic::fshl || IntrinsicID == Intrinsic::fshr) && - match(II->getArgOperand(2), m_APInt(Amt))) { - - // We can treat fshr as a fshl by flipping the modulo amount. 
- unsigned ModAmt = Amt->urem(BitWidth); - if (IntrinsicID == Intrinsic::fshr) - ModAmt = BitWidth - ModAmt; - - const auto &LHS = collectBitParts(II->getArgOperand(0), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); - const auto &RHS = collectBitParts(II->getArgOperand(1), MatchBSwaps, - MatchBitReversals, BPS, Depth + 1); - - // Check we have both sources and they are from the same provider. - if (!LHS || !RHS || !LHS->Provider || LHS->Provider != RHS->Provider) - return Result; - - unsigned StartBitRHS = BitWidth - ModAmt; - Result = BitPart(LHS->Provider, BitWidth); - for (unsigned BitIdx = 0; BitIdx < StartBitRHS; ++BitIdx) - Result->Provenance[BitIdx + ModAmt] = LHS->Provenance[BitIdx]; - for (unsigned BitIdx = 0; BitIdx < ModAmt; ++BitIdx) - Result->Provenance[BitIdx] = RHS->Provenance[BitIdx + StartBitRHS]; + // Funnel 'double' shifts take 3 operands, 2 inputs and the shift + // amount (modulo). + // fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW))) + // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + if (match(V, m_FShl(m_Value(X), m_Value(Y), m_APInt(C))) || + match(V, m_FShr(m_Value(X), m_Value(Y), m_APInt(C)))) { + // We can treat fshr as a fshl by flipping the modulo amount. + unsigned ModAmt = C->urem(BitWidth); + if (cast(I)->getIntrinsicID() == Intrinsic::fshr) + ModAmt = BitWidth - ModAmt; + + const auto &LHS = + collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1); + const auto &RHS = + collectBitParts(Y, MatchBSwaps, MatchBitReversals, BPS, Depth + 1); + + // Check we have both sources and they are from the same provider. 
+ if (!LHS || !RHS || !LHS->Provider || LHS->Provider != RHS->Provider) return Result; - } + + unsigned StartBitRHS = BitWidth - ModAmt; + Result = BitPart(LHS->Provider, BitWidth); + for (unsigned BitIdx = 0; BitIdx < StartBitRHS; ++BitIdx) + Result->Provenance[BitIdx + ModAmt] = LHS->Provenance[BitIdx]; + for (unsigned BitIdx = 0; BitIdx < ModAmt; ++BitIdx) + Result->Provenance[BitIdx] = RHS->Provenance[BitIdx + StartBitRHS]; + return Result; } } diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -98,9 +98,17 @@ include(CheckLibraryExists) include(CheckCCompilerFlag) + include(CMakePushCheckState) - # We don't have libc++ (yet). - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nostdlib++") + cmake_push_check_state() + + # We don't have libc++ (yet)... + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nostdinc++ -nostdlib++") + + # ...but we need access to libc++ headers for CMake checks to succeed. + if (LLVM_EXTERNAL_LIBCXX_SOURCE_DIR AND "libcxx" IN_LIST LLVM_ENABLE_RUNTIMES) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -isystem ${LLVM_EXTERNAL_LIBCXX_SOURCE_DIR}/include") + endif() # Avoid checking whether the compiler is working. set(LLVM_COMPILER_CHECKED ON) @@ -110,8 +118,7 @@ include(HandleLLVMOptions) include(FindPythonInterp) - # Remove the -nostdlib++ option we've added earlier. - string(REPLACE "-nostdlib++" "" CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") + cmake_pop_check_state() # Use libtool instead of ar if you are both on an Apple host, and targeting Apple. if(CMAKE_HOST_APPLE AND APPLE) @@ -215,15 +222,6 @@ else() # if this is included from LLVM's CMake include(LLVMExternalProjectUtils) - if (LLVM_EXTERNAL_LIBCXX_SOURCE_DIR AND "libcxx" IN_LIST LLVM_ENABLE_RUNTIMES) - # This looks wrong, but libcxx's build actually wants the header dir to be - # the root build dir, not the include directory. 
- set(LIBCXX_BINARY_DIR ${LLVM_BINARY_DIR}) - set(LIBCXX_SOURCE_DIR ${LLVM_EXTERNAL_LIBCXX_SOURCE_DIR}) - set(LIBCXX_HEADER_DIR ${LLVM_BINARY_DIR}) - set(CXX_HEADER_TARGET runtime-libcxx-headers) - add_subdirectory(${LLVM_EXTERNAL_LIBCXX_SOURCE_DIR}/include ${CXX_HEADER_TARGET}) - endif() if(NOT LLVM_BUILD_RUNTIMES) set(EXTRA_ARGS EXCLUDE_FROM_ALL) @@ -298,11 +296,6 @@ endif() endforeach() - # 64-bit XCOFF and big AR format is not yet supported in some of these tools. - if(NOT target MATCHES aix) - set(${target}_toolchain_tools lld llvm-ar llvm-lipo llvm-ranlib llvm-nm llvm-objcopy llvm-objdump llvm-strip) - endif() - llvm_ExternalProject_Add(builtins-${target} ${compiler_rt_path}/lib/builtins DEPENDS ${ARG_DEPENDS} @@ -316,7 +309,6 @@ -DCMAKE_ASM_COMPILER_WORKS=ON -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON ${${target}_extra_args} - TOOLCHAIN_TOOLS clang ${${target}_toolchain_tools} USE_TOOLCHAIN ${EXTRA_ARGS}) endfunction() @@ -420,7 +412,7 @@ llvm_ExternalProject_Add(runtimes ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS ${ARG_DEPENDS} ${CXX_HEADER_TARGET} + DEPENDS ${ARG_DEPENDS} # Builtins were built separately above CMAKE_ARGS -DCOMPILER_RT_BUILD_BUILTINS=Off -DLLVM_INCLUDE_TESTS=${LLVM_INCLUDE_TESTS} @@ -524,14 +516,9 @@ list(APPEND EXTRA_ARGS STRIP_TOOL ${CMAKE_CURRENT_BINARY_DIR}/llvm-strip-link) endif() - # 64-bit XCOFF and big AR format is not yet supported in some of these tools. 
- if(NOT target MATCHES aix) - set(${name}_toolchain_tools lld llvm-ar llvm-lipo llvm-ranlib llvm-nm llvm-objcopy llvm-objdump llvm-strip) - endif() - llvm_ExternalProject_Add(runtimes-${name} ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS ${${name}_deps} ${CXX_HEADER_TARGET} + DEPENDS ${${name}_deps} # Builtins were built separately above CMAKE_ARGS -DCOMPILER_RT_BUILD_BUILTINS=Off -DLLVM_INCLUDE_TESTS=${LLVM_INCLUDE_TESTS} @@ -547,7 +534,6 @@ -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON -DLLVM_RUNTIMES_TARGET=${name} ${${name}_extra_args} - TOOLCHAIN_TOOLS clang ${${name}_toolchain_tools} EXTRA_TARGETS ${${name}_extra_targets} ${${name}_test_targets} USE_TOOLCHAIN diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp-arith.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp-arith.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp-arith.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp-arith.mir @@ -73,3 +73,42 @@ $q0 = COPY %2(<4 x s32>) ... +--- +name: test_fmul_v4s64 +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_fmul_v4s64 + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[FMUL:%[0-9]+]]:_(<2 x s64>) = G_FMUL [[DEF]], [[DEF]] + ; CHECK: [[FMUL1:%[0-9]+]]:_(<2 x s64>) = G_FMUL [[DEF]], [[DEF]] + ; CHECK: $q0 = COPY [[FMUL]](<2 x s64>) + ; CHECK: $q1 = COPY [[FMUL1]](<2 x s64>) + %0:_(<4 x s64>) = G_IMPLICIT_DEF + %1:_(<4 x s64>) = G_IMPLICIT_DEF + %2:_(<4 x s64>) = G_FMUL %0, %1 + %uv1:_(<2 x s64>), %uv2:_(<2 x s64>) = G_UNMERGE_VALUES %2 + $q0 = COPY %uv1(<2 x s64>) + $q1 = COPY %uv2(<2 x s64>) + +... 
+--- +name: test_fmul_v8s32 +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_fmul_v8s32 + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32) + ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32) + ; CHECK: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; CHECK: [[FMUL1:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] + ; CHECK: $q0 = COPY [[FMUL]](<4 x s32>) + ; CHECK: $q1 = COPY [[FMUL1]](<4 x s32>) + %0:_(<8 x s32>) = G_IMPLICIT_DEF + %1:_(<8 x s32>) = G_IMPLICIT_DEF + %2:_(<8 x s32>) = G_FMUL %0, %1 + %uv1:_(<4 x s32>), %uv2:_(<4 x s32>) = G_UNMERGE_VALUES %2 + $q0 = COPY %uv1(<4 x s32>) + $q1 = COPY %uv2(<4 x s32>) +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=aarch64 -run-pass=legalizer -O0 %s -o - | FileCheck %s +# RUN: llc -march=aarch64 -run-pass=legalizer -global-isel-abort=1 -O0 %s -o - | FileCheck %s --- name: test_freeze_s64 body: | @@ -67,3 +67,21 @@ $w0 = COPY %1 $w1 = COPY %2 ... 
+--- +name: test_freeze_v8s8 +body: | + bb.0: + liveins: $d0 + + ; CHECK-LABEL: name: test_freeze_v8s8 + ; CHECK: %d0:_(<8 x s8>) = COPY $d0 + ; CHECK: [[FREEZE:%[0-9]+]]:_(<8 x s8>) = G_FREEZE %d0 + ; CHECK: [[UV:%[0-9]+]]:_(<4 x s8>), [[UV1:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[FREEZE]](<8 x s8>) + ; CHECK: $w0 = COPY [[UV]](<4 x s8>) + ; CHECK: $w1 = COPY [[UV1]](<4 x s8>) + %d0:_(<8 x s8>) = COPY $d0 + %0:_(<8 x s8>) = G_FREEZE %d0 + %1:_(<4 x s8>), %2:_(<4 x s8>) = G_UNMERGE_VALUES %0 + $w0 = COPY %1 + $w1 = COPY %2 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir @@ -1,51 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=aarch64-unknown-unknown -verify-machineinstrs -run-pass=legalizer %s -o - | FileCheck %s ---- | - ; ModuleID = '/tmp/test.ll' - source_filename = "/tmp/test.ll" - target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" - target triple = "aarch64-unknown-unknown" - - define i32 @legalize_phi(i32 %argc) { - entry: - ret i32 0 - } - - define i64* @legalize_phi_ptr(i64* %a, i64* %b, i1 %cond) { - entry: - ret i64* null - } - - define i32 @legalize_phi_empty(i32 %argc) { - entry: - ret i32 0 - } - - define i32 @legalize_phi_loop(i32 %argc) { - entry: - ret i32 0 - } - - define i32 @legalize_phi_cycle(i32 %argc) { - entry: - ret i32 0 - } - define i32 @legalize_phi_same_bb(i32 %argc) { - entry: - ret i32 0 - } - - define i32 @legalize_phi_diff_bb(i32 %argc, i32 %argc2) { - entry: - ret i32 0 - } - - define i32 @legalize_phi_check_insertpt(i64 %a) { - entry: - ret i32 0 - } - -... 
+# RUN: llc -O0 -mtriple=aarch64-unknown-unknown -verify-machineinstrs -global-isel-abort=1 -run-pass=legalizer %s -o - | FileCheck %s --- name: legalize_phi alignment: 4 @@ -610,7 +564,6 @@ selected: false tracksRegLiveness: true body: | - ; Check that the G_MERGE here gets inserted after all the PHIs. ; CHECK-LABEL: name: legalize_phi_check_insertpt ; CHECK: bb.0: ; CHECK: successors: %bb.1(0x80000000) @@ -627,6 +580,7 @@ ; CHECK: G_STORE [[MV]](s128), [[COPY1]](p0) :: (store 16) ; CHECK: G_STORE [[PHI2]](s64), [[COPY1]](p0) :: (store 8) ; CHECK: RET_ReallyLR + ; Check that the G_MERGE here gets inserted after all the PHIs. bb.0: successors: %bb.1(0x40000000) liveins: $x0, $x1 @@ -644,3 +598,29 @@ RET_ReallyLR ... +--- +name: legalize_phi_vector +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: legalize_phi_vector + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $q0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 + ; CHECK: G_BR %bb.1 + ; CHECK: bb.1: + ; CHECK: [[PHI:%[0-9]+]]:_(<16 x s8>) = G_PHI [[COPY]](<16 x s8>), %bb.0 + ; CHECK: $q0 = COPY [[PHI]](<16 x s8>) + ; CHECK: RET_ReallyLR + bb.0: + successors: %bb.1 + liveins: $q0, $x1 + + %0:_(<16 x s8>) = COPY $q0 + G_BR %bb.1 + + bb.1: + %3:_(<16 x s8>) = G_PHI %0(<16 x s8>), %bb.0 + $q0 = COPY %3(<16 x s8>) + RET_ReallyLR +... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir @@ -1,28 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s ---- | - target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" - target triple = "aarch64" - - define <4 x float> @test_f32(float %a, float %b, float %c, float %d) { - ret <4 x float> undef - } - - define <2 x double> @test_f64(double %a, double %b) { - ret <2 x double> undef - } - - define <4 x i32> @test_i32(i32 %a, i32 %b, i32 %c, i32 %d) { - ret <4 x i32> undef - } - - define <2 x i64> @test_i64(i64 %a, i64 %b) { - ret <2 x i64> undef - } - - define void @test_p0(i64 *%a, i64 *%b) { ret void } - -... --- name: test_f32 alignment: 4 @@ -33,7 +10,7 @@ failedISel: false tracksRegLiveness: true body: | - bb.0 (%ir-block.0): + bb.0: liveins: $s0, $s1, $s2, $s3 ; CHECK-LABEL: name: test_f32 @@ -74,7 +51,7 @@ failedISel: false tracksRegLiveness: true body: | - bb.0 (%ir-block.0): + bb.0: liveins: $d0, $d1, $d2, $d3 ; CHECK-LABEL: name: test_f64 @@ -105,7 +82,7 @@ failedISel: false tracksRegLiveness: true body: | - bb.0 (%ir-block.0): + bb.0: liveins: $w0, $w1, $w2, $w3 ; CHECK-LABEL: name: test_i32 @@ -140,7 +117,7 @@ failedISel: false tracksRegLiveness: true body: | - bb.0 (%ir-block.0): + bb.0: liveins: $x0, $x1 ; CHECK-LABEL: name: test_i64 @@ -169,7 +146,7 @@ failedISel: false tracksRegLiveness: true body: | - bb.0 (%ir-block.0): + bb.0: liveins: $x0, $x1 ; CHECK-LABEL: name: test_p0 @@ -188,3 +165,66 @@ RET_ReallyLR implicit $q0 ... 
+--- +name: test_v4s32_zero +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: test_v4s32_zero + ; CHECK: liveins: $x0 + ; CHECK: [[MOVIv2d_ns:%[0-9]+]]:fpr128 = MOVIv2d_ns 0 + ; CHECK: $q0 = COPY [[MOVIv2d_ns]] + ; CHECK: RET_ReallyLR + %0:gpr(p0) = COPY $x0 + %2:gpr(s32) = G_CONSTANT i32 0 + %3:fpr(s32) = COPY %2(s32) + %4:fpr(s32) = COPY %2(s32) + %5:fpr(s32) = COPY %2(s32) + %6:fpr(s32) = COPY %2(s32) + %1:fpr(<4 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32), %5(s32), %6(s32) + $q0 = COPY %1(<4 x s32>) + RET_ReallyLR +... +--- +name: test_v8s8_zero +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: test_v8s8_zero + ; CHECK: liveins: $x0 + ; CHECK: [[MOVIv2d_ns:%[0-9]+]]:fpr128 = MOVIv2d_ns 0 + ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY [[MOVIv2d_ns]].dsub + ; CHECK: $d0 = COPY [[COPY]] + ; CHECK: RET_ReallyLR + %0:gpr(p0) = COPY $x0 + %2:gpr(s8) = G_CONSTANT i8 0 + %3:fpr(s8) = COPY %2(s8) + %4:fpr(s8) = COPY %2(s8) + %5:fpr(s8) = COPY %2(s8) + %6:fpr(s8) = COPY %2(s8) + %7:fpr(s8) = COPY %2(s8) + %8:fpr(s8) = COPY %2(s8) + %9:fpr(s8) = COPY %2(s8) + %10:fpr(s8) = COPY %2(s8) + %1:fpr(<8 x s8>) = G_BUILD_VECTOR %3(s8), %4(s8), %5(s8), %6(s8), %7(s8), %8(s8), %9(s8), %10(s8) + $d0 = COPY %1(<8 x s8>) + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-fcmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-fcmp.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-fcmp.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-fcmp.mir @@ -54,3 +54,56 @@ %3:gpr(s32) = G_FCMP floatpred(oeq), %0(s32), %2 $s0 = COPY %3(s32) RET_ReallyLR implicit $s0 + +... 
+--- +name: notzero_s64 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.1: + liveins: $d0, $d1 + + ; CHECK-LABEL: name: notzero_s64 + ; CHECK: liveins: $d0, $d1 + ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK: [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 112 + ; CHECK: FCMPDrr [[COPY]], [[FMOVDi]], implicit-def $nzcv + ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 1, implicit $nzcv + ; CHECK: $s0 = COPY [[CSINCWr]] + ; CHECK: RET_ReallyLR implicit $s0 + %0:fpr(s64) = COPY $d0 + %1:fpr(s64) = COPY $d1 + %2:fpr(s64) = G_FCONSTANT double 1.000000e+00 + %3:gpr(s32) = G_FCMP floatpred(oeq), %0(s64), %2 + $s0 = COPY %3(s32) + RET_ReallyLR implicit $s0 + + +... +--- +name: zero_s64 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $d0, $d1, $s0 + + ; CHECK-LABEL: name: zero_s64 + ; CHECK: liveins: $d0, $d1, $s0 + ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK: FCMPDri [[COPY]], implicit-def $nzcv + ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 1, implicit $nzcv + ; CHECK: $s0 = COPY [[CSINCWr]] + ; CHECK: RET_ReallyLR implicit $s0 + %0:fpr(s64) = COPY $d0 + %1:fpr(s64) = COPY $d1 + %2:fpr(s64) = G_FCONSTANT double 0.000000e+00 + %3:gpr(s32) = G_FCMP floatpred(oeq), %0(s64), %2 + $s0 = COPY %3(s32) + RET_ReallyLR implicit $s0 diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -956,8 +956,8 @@ ; DAG: abs.2s ; DAG-NEXT: ret -; GISEL: neg.2s -; GISEL: cmge.2s +; GISEL-DAG: neg.2s +; GISEL-DAG: cmge.2s ; GISEL: bif.8b %tmp1neg = sub <2 x i32> zeroinitializer, %a %b = icmp sge <2 x i32> %a, zeroinitializer @@ -1035,8 +1035,8 @@ ; DAG: abs.2d ; DAG-NEXT: ret -; GISEL: neg.2d -; GISEL: cmge.2d +; GISEL-DAG: neg.2d +; GISEL-DAG: cmge.2d ; GISEL: bit.16b %tmp1neg = sub <2 x i64> zeroinitializer, 
%a %b = icmp sle <2 x i64> %a, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll --- a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll +++ b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll @@ -26,7 +26,10 @@ entry: ; FIXME: Missed optimization, the entire SP push/pop could be removed ; CHECK-LABEL: f2 -; CHECK: stp xzr, xzr, [sp, #-16]! +; CHECK: sub sp, sp, #16 +; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: stp xzr, xzr, [sp] ; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: mov x1, xzr ; CHECK-NEXT: .seh_startepilogue diff --git a/llvm/test/CodeGen/AArch64/combine-loads.ll b/llvm/test/CodeGen/AArch64/combine-loads.ll --- a/llvm/test/CodeGen/AArch64/combine-loads.ll +++ b/llvm/test/CodeGen/AArch64/combine-loads.ll @@ -4,8 +4,7 @@ define <2 x i64> @z(i64* nocapture nonnull readonly %p) { ; CHECK-LABEL: z: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI0_0 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldr x9, [x0] ; CHECK-NEXT: ldr x8, [x0, #8] ; CHECK-NEXT: mov v0.d[0], x9 diff --git a/llvm/test/CodeGen/AArch64/sve-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-fcvt.ll --- a/llvm/test/CodeGen/AArch64/sve-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcvt.ll @@ -5,6 +5,94 @@ ; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. 
; WARN-NOT: warning +; +; FP_EXTEND +; + +define @fcvts_nxv2f16( %a) { +; CHECK-LABEL: fcvts_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.s, p0/m, z0.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvts_nxv4f16( %a) { +; CHECK-LABEL: fcvts_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.s, p0/m, z0.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv2f16( %a) { +; CHECK-LABEL: fcvtd_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.d, p0/m, z0.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv2f32( %a) { +; CHECK-LABEL: fcvtd_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.d, p0/m, z0.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +; +; FP_ROUND +; + +define @fcvth_nxv2f32( %a) { +; CHECK-LABEL: fcvth_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvth_nxv4f32( %a) { +; CHECK-LABEL: fcvth_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvth_nxv2f64( %a) { +; CHECK-LABEL: fcvth_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.d +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvts_nxv2f64( %a) { +; CHECK-LABEL: fcvts_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.s, p0/m, z0.d +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + ; ; FP_TO_SINT ; diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll --- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll @@ -5,6 +5,152 @@ ; If this check fails please read test/CodeGen/AArch64/README for instructions on how to 
resolve it. ; WARN-NOT: warning +; FP_EXTEND + +define @fcvts_nxv8f16( %a) { +; CHECK-LABEL: fcvts_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uunpkhi z2.s, z0.h +; CHECK-NEXT: fcvt z0.s, p0/m, z1.h +; CHECK-NEXT: fcvt z1.s, p0/m, z2.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv4f16( %a) { +; CHECK-LABEL: fcvtd_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: fcvt z0.d, p0/m, z1.h +; CHECK-NEXT: fcvt z1.d, p0/m, z2.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv8f16( %a) { +; CHECK-LABEL: fcvtd_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z2.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z4.d, z0.s +; CHECK-NEXT: fcvt z0.d, p0/m, z2.h +; CHECK-NEXT: fcvt z1.d, p0/m, z1.h +; CHECK-NEXT: fcvt z2.d, p0/m, z3.h +; CHECK-NEXT: fcvt z3.d, p0/m, z4.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv4f32( %a) { +; CHECK-LABEL: fcvtd_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: fcvt z0.d, p0/m, z1.s +; CHECK-NEXT: fcvt z1.d, p0/m, z2.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv8f32( %a) { +; CHECK-LABEL: fcvtd_nxv8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z2.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z3.d, z0.s +; CHECK-NEXT: uunpklo z4.d, z1.s +; CHECK-NEXT: uunpkhi z5.d, z1.s +; CHECK-NEXT: fcvt z0.d, p0/m, z2.s +; CHECK-NEXT: fcvt z1.d, p0/m, z3.s +; CHECK-NEXT: fcvt z2.d, p0/m, z4.s +; CHECK-NEXT: fcvt z3.d, p0/m, z5.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +; FP_ROUND + +define @fcvth_nxv8f32( %a) { +; CHECK-LABEL: fcvth_nxv8f32: 
+; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z1.h, p0/m, z1.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvth_nxv8f64( %a) { +; CHECK-LABEL: fcvth_nxv8f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z3.h, p0/m, z3.d +; CHECK-NEXT: fcvt z2.h, p0/m, z2.d +; CHECK-NEXT: fcvt z1.h, p0/m, z1.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvth_nxv4f64( %a) { +; CHECK-LABEL: fcvth_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z1.h, p0/m, z1.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvts_nxv4f64( %a) { +; CHECK-LABEL: fcvts_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z1.s, p0/m, z1.d +; CHECK-NEXT: fcvt z0.s, p0/m, z0.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvts_nxv8f64( %a) { +; CHECK-LABEL: fcvts_nxv8f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z1.s, p0/m, z1.d +; CHECK-NEXT: fcvt z0.s, p0/m, z0.d +; CHECK-NEXT: fcvt z3.s, p0/m, z3.d +; CHECK-NEXT: fcvt z2.s, p0/m, z2.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.s, z2.s, z3.s +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + ; FP_TO_SINT ; Split operand @@ -95,3 +241,144 @@ %res = fptoui %a to ret %res } + +; SINT_TO_FP + +; Split operand +define @scvtf_s_nxv4i64( %a) { +; CHECK-LABEL: scvtf_s_nxv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z1.s, p0/m, z1.d +; CHECK-NEXT: scvtf z0.s, p0/m, z0.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = sitofp %a to + ret %res +} + +define 
@scvtf_h_nxv8i64( %a) { +; CHECK-LABEL: scvtf_h_nxv8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z3.h, p0/m, z3.d +; CHECK-NEXT: scvtf z2.h, p0/m, z2.d +; CHECK-NEXT: scvtf z1.h, p0/m, z1.d +; CHECK-NEXT: scvtf z0.h, p0/m, z0.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ret + %res = sitofp %a to + ret %res +} + +; Split result +define @scvtf_s_nxv16i8( %a) { +; CHECK-LABEL: scvtf_s_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sunpklo z1.h, z0.b +; CHECK-NEXT: sunpkhi z0.h, z0.b +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: scvtf z0.s, p0/m, z2.s +; CHECK-NEXT: scvtf z1.s, p0/m, z1.s +; CHECK-NEXT: scvtf z2.s, p0/m, z3.s +; CHECK-NEXT: scvtf z3.s, p0/m, z4.s +; CHECK-NEXT: ret + %res = sitofp %a to + ret %res +} + +define @scvtf_d_nxv4i32( %a) { +; CHECK-LABEL: scvtf_d_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sunpklo z1.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sunpkhi z2.d, z0.s +; CHECK-NEXT: scvtf z0.d, p0/m, z1.d +; CHECK-NEXT: scvtf z1.d, p0/m, z2.d +; CHECK-NEXT: ret + %res = sitofp %a to + ret %res +} + +define @scvtf_d_nxv4i1( %a) { +; CHECK-LABEL: scvtf_d_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: zip1 p3.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: ptrue p2.d +; CHECK-NEXT: mov z0.d, p3/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: scvtf z0.d, p2/m, z0.d +; CHECK-NEXT: scvtf z1.d, p2/m, z1.d +; CHECK-NEXT: ret + %res = sitofp %a to + ret %res +} + +; UINT_TO_FP + +; Split operand +define @ucvtf_s_nxv4i64( %a) { +; CHECK-LABEL: ucvtf_s_nxv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; 
CHECK-NEXT: ret + %res = uitofp %a to + ret %res +} + +define @ucvtf_h_nxv8i64( %a) { +; CHECK-LABEL: ucvtf_h_nxv8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z3.h, p0/m, z3.d +; CHECK-NEXT: ucvtf z2.h, p0/m, z2.d +; CHECK-NEXT: ucvtf z1.h, p0/m, z1.d +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ret + %res = uitofp %a to + ret %res +} + +; Split result +define @ucvtf_d_nxv4i32( %a) { +; CHECK-LABEL: ucvtf_d_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ucvtf z0.d, p0/m, z1.d +; CHECK-NEXT: ucvtf z1.d, p0/m, z2.d +; CHECK-NEXT: ret + %res = uitofp %a to + ret %res +} + +define @ucvtf_d_nxv4i1( %a) { +; CHECK-LABEL: ucvtf_d_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: zip1 p3.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: ptrue p2.d +; CHECK-NEXT: mov z0.d, p3/z, #1 // =0x1 +; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1 +; CHECK-NEXT: ucvtf z0.d, p2/m, z0.d +; CHECK-NEXT: ucvtf z1.d, p2/m, z1.d +; CHECK-NEXT: ret + %res = uitofp %a to + ret %res +} diff --git a/llvm/test/CodeGen/AVR/interrupts.ll b/llvm/test/CodeGen/AVR/interrupts.ll --- a/llvm/test/CodeGen/AVR/interrupts.ll +++ b/llvm/test/CodeGen/AVR/interrupts.ll @@ -64,5 +64,40 @@ ret void } +define avr_intrcc void @interrupt_alloca() { +; CHECK-LABEL: interrupt_alloca: +; CHECK: sei +; CHECK-NEXT: push r0 +; CHECK-NEXT: push r1 +; CHECK-NEXT: in r0, 63 +; CHECK-NEXT: push r0 +; CHECK: clr r0 +; CHECK: push r28 +; CHECK-NEXT: push r29 +; CHECK-NEXT: in r28, 61 +; CHECK-NEXT: in r29, 62 +; CHECK-NEXT: sbiw r28, 1 +; CHECK-NEXT: in r0, 63 +; CHECK-NEXT: cli +; CHECK-NEXT: out 62, r29 +; CHECK-NEXT: out 63, r0 +; CHECK-NEXT: out 61, r28 +; CHECK: adiw r28, 1 +; CHECK-NEXT: in r0, 63 +; CHECK-NEXT: cli +; CHECK-NEXT: out 62, r29 +; CHECK-NEXT: 
out 63, r0 +; CHECK-NEXT: out 61, r28 +; CHECK-NEXT: pop r29 +; CHECK-NEXT: pop r28 +; CHECK: pop r0 +; CHECK-NEXT: out 63, r0 +; CHECK-NEXT: pop r1 +; CHECK-NEXT: pop r0 +; CHECK-NEXT: reti + alloca i8 + ret void +} + attributes #0 = { "interrupt" } attributes #1 = { "signal" } diff --git a/llvm/test/CodeGen/Hexagon/registerpassbuildercallbacks.ll b/llvm/test/CodeGen/Hexagon/registerpassbuildercallbacks.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/registerpassbuildercallbacks.ll @@ -0,0 +1,27 @@ +; RUN: opt -mtriple=hexagon -disable-verify -debug-pass-manager \ +; RUN: -disable-output -passes='default' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=NPM +; RUN: opt -mtriple=hexagon -disable-verify -debug-pass-manager \ +; RUN: -disable-output -passes='default' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=NPM +; RUN: opt -mtriple=hexagon -disable-verify -debug-pass-manager \ +; RUN: -disable-output -passes='default' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=NPM + +; Test TargetMachine::registerPassBuilderCallbacks +; NPM: Running pass: HexagonVectorLoopCarriedReusePass + +declare void @bar() local_unnamed_addr + +define void @foo(i32 %n) local_unnamed_addr { +entry: + br label %loop +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, 1 + tail call void @bar() + %cmp = icmp eq i32 %iv, %n + br i1 %cmp, label %exit, label %loop +exit: + ret void +} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir @@ -142,18 +142,22 @@ ; CHECK: renamable $lr = t2ADDri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed 
renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: t2STRi12 killed renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr) - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 + ; CHECK: t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr) + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg ; CHECK: bb.1.do.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: liveins: $r0, $r1, $r2, $r12 ; CHECK: $lr = tMOVr $r2, 14 /* CC::al */, $noreg + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg ; CHECK: renamable $r2, dead $cpsr = nsw tSUBi8 killed $r2, 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.02, align 4) + ; CHECK: renamable $r12 = nsw t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.02, align 4) ; CHECK: renamable $q0 = MVE_VMULf32 killed renamable $q0, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r1 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r1, 16, 0, killed $noreg :: (store 16 into %ir.pDst.addr.01, align 4) - ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r1 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r1, 16, 1, killed renamable $vpr :: (store 16 into %ir.pDst.addr.01, align 4) + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 ; CHECK: bb.2.do.end: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: @@ -242,19 +246,23 @@ ; CHECK: renamable $r2 = t2RSBrs killed renamable $lr, 
killed renamable $r2, 10, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = t2ADDri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: dead renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr) ; CHECK: $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg ; CHECK: bb.1.do.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: liveins: $r0, $r1, $r2, $r12 ; CHECK: $lr = tMOVr $r2, 14 /* CC::al */, $noreg + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg ; CHECK: renamable $r2, dead $cpsr = nsw tSUBi8 killed $r2, 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.02, align 4) + ; CHECK: renamable $r12 = nsw t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.02, align 4) ; CHECK: renamable $q0 = MVE_VMULf32 killed renamable $q0, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r1 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r1, 16, 0, killed $noreg :: (store 16 into %ir.pDst.addr.01, align 4) - ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r1 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r1, 16, 1, killed renamable $vpr 
:: (store 16 into %ir.pDst.addr.01, align 4) + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 ; CHECK: bb.2.do.end: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir @@ -78,12 +78,12 @@ ; CHECK: successors: %bb.5(0x80000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 ; CHECK: renamable $s4 = nnan ninf nsz VADDS renamable $s0, renamable $s1, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed $r4 ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: renamable $s4 = nnan ninf nsz VADDS renamable $s2, killed renamable $s4, 14 /* CC::al */, $noreg ; CHECK: renamable $s0 = nnan ninf nsz VADDS killed renamable $s3, killed renamable $s4, 14 /* CC::al */, $noreg, implicit killed $q0 ; CHECK: $s2 = VMOVSR $r1, 14 /* CC::al */, $noreg ; CHECK: renamable $s2 = VUITOS killed renamable $s2, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2DLS killed $r4 ; CHECK: renamable $s4 = nnan ninf nsz VDIVS killed renamable $s0, killed renamable $s2, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: bb.5: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir @@ -273,7 +273,6 @@ ; CHECK: renamable $r5 = tLDRr renamable $r1, $r3, 14 /* CC::al */, $noreg :: (load 4 from %ir.scevgep617) ; CHECK: renamable $r7, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $r6 = tLDRr renamable $r2, $r3, 14 /* CC::al */, $noreg :: 
(load 4 from %ir.scevgep418) - ; CHECK: dead $r12 = tMOVr $lr, 14 /* CC::al */, $noreg ; CHECK: renamable $r8 = nuw t2ADDri killed renamable $r8, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r5, dead $cpsr = tEOR killed renamable $r5, killed renamable $r6, 14 /* CC::al */, $noreg ; CHECK: renamable $r6 = tLDRr renamable $r0, $r3, 14 /* CC::al */, $noreg :: (load 4 from %ir.scevgep219) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir @@ -153,25 +153,17 @@ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool) - ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 ; CHECK: $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed 
renamable $r2, 4, 14 /* CC::al */, $noreg - ; CHECK: MVE_VPST 2, implicit $vpr - ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv12, align 4) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1315, align 4) - ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 1, killed renamable $vpr - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: liveins: $lr, $q1, $r0, $r1 + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv12, align 4) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1315, align 4) + ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 0, killed $noreg + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q1 ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS renamable $s6, renamable $s7, 14 /* CC::al */, $noreg @@ -285,27 +277,18 @@ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 3, 14 /* CC::al */, $noreg - ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable 
$r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool) - ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r2, dead $cpsr = tLSRri killed renamable $r2, 2, 14 /* CC::al */, $noreg ; CHECK: $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg - ; CHECK: MVE_VPST 2, implicit $vpr - ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) - ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 1, killed renamable $vpr - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: liveins: $lr, $q1, $r0, $r1 + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 0, killed $noreg + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q1 ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS renamable $s6, renamable $s7, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir @@ -249,7 +249,7 @@ ; CHECK: renamable $r2 = t2LDRs renamable $r9, renamable $r1, 2, 14 /* CC::al */, $noreg, debug-location !41 :: (load 4 from %ir.arrayidx7.us) ; CHECK: $r3 = tMOVr $r5, 14 /* CC::al */, $noreg, debug-location !32 ; CHECK: $r0 = tMOVr $r8, 14 /* CC::al */, $noreg, debug-location !32 - ; CHECK: $lr = t2DLS renamable $r10, debug-location !32 + ; CHECK: $lr = t2DLS renamable $r10, debug-location !42 ; CHECK: bb.3.for.body3.us: ; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r5, $r8, $r9, $r10, $r12 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir @@ -160,11 +160,11 @@ ; CHECK: renamable $r3, dead $cpsr = tSUBrr renamable $r1, killed renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 3, 14 /* CC::al */, $noreg - ; CHECK: dead renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: $r12 = tMOVr $r0, 14 /* CC::al */, $noreg + ; CHECK: $r4 = tMOVr killed $lr, 14 /* CC::al */, $noreg ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 - ; CHECK: $r4 = tMOVr $lr, 14 /* CC::al */, $noreg ; CHECK: bb.1.do.body.i: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r4, $r12 @@ -175,8 
+175,8 @@ ; CHECK: successors: %bb.3(0x80000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 ; CHECK: $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed $r4 ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0 + ; CHECK: $lr = t2DLS killed $r4 ; CHECK: renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -17,8 +17,8 @@ ; CHECK-NEXT: add.w lr, r12, r3, lsr #2 ; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: mov r4, lr +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_1: @ %do.body.i ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r12], #16 @@ -26,10 +26,10 @@ ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit ; CHECK-NEXT: vmov s4, r1 -; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: vadd.f32 s0, s3, s3 ; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: vcvt.f32.u32 s4, s4 +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: vdiv.f32 s0, s0, s4 ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir @@ -117,32 +117,21 @@ ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: 
liveins: $r0, $r1, $r2, $r3 - ; CHECK: renamable $r12 = t2MOVi 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = nuw t2ADDrs killed renamable $r12, renamable $r3, 11, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: $r12 = t2MOVr killed $r3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg ; CHECK: renamable $r12 = t2LSRri killed renamable $r12, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14 /* CC::al */, $noreg - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep45, align 1) + ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep23, align 1) + ; CHECK: 
renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir @@ -117,32 +117,21 @@ ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 - ; CHECK: renamable $r12 = t2MOVi 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = nuw t2ADDrs killed renamable $r12, renamable $r3, 11, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: $r12 = t2MOVr killed $r3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg ; CHECK: renamable $r12 = t2LSRri killed renamable 
$r12, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14 /* CC::al */, $noreg - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep45, align 1) + ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep23, align 1) + ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -451,9 +451,9 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_5: @ %vector.body46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 @@ -686,8 +686,8 @@ ; CHECK-NEXT: mla r2, r4, r3, r2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_6: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1156,8 +1156,8 @@ ; CHECK-NEXT: @ %bb.5: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 ; CHECK-NEXT: ldr.w lr, [sp] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB16_6: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1116,8 +1116,8 @@ ; CHECK-NEXT: @ %bb.5: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 ; CHECK-NEXT: ldr.w lr, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB16_6: @ 
%for.body ; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -1435,10 +1435,10 @@ ; CHECK-NEXT: vdup.32 q1, r6 ; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: vmov.f32 s10, s14 ; CHECK-NEXT: vmov.f32 s7, s12 ; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB17_3: @ Parent Loop BB17_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q4, [r1, q0, uxtw #2] @@ -1589,8 +1589,8 @@ ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB18_3 Depth 2 ; CHECK-NEXT: ldr r4, [r2] -; CHECK-NEXT: dls lr, r5 ; CHECK-NEXT: vdup.32 q0, r4 +; CHECK-NEXT: dls lr, r5 ; CHECK-NEXT: .LBB18_3: @ %while.body ; CHECK-NEXT: @ Parent Loop BB18_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -265,9 +265,9 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: eor r12, r4, #-2147483648 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -529,9 +529,9 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: eor r12, r4, #-2147483648 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -709,12 +709,12 @@ ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 ; CHECK-NEXT: ldr r0, [sp, #112] ; CHECK-NEXT: sub.w lr, r11, r5 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: mla r3, r0, r5, r1 ; CHECK-NEXT: add r5, r9 ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r5, r0, r5, lsl #1 ; CHECK-NEXT: add.w r3, r6, r3, lsl #1 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll @@ -556,8 +556,8 @@ ; CHECK-NEXT: vmov.f16 r1, s0 ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: adr r2, .LCPI9_1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -11,9 +11,9 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 @@ -231,17 +231,11 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq.w .LBB3_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: adr r7, .LCPI3_5 -; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: vmov.i32 q0, #0x8000 -; 
CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: adr r6, .LCPI3_4 ; CHECK-NEXT: adr r5, .LCPI3_3 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: adr r4, .LCPI3_2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp, #160] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: adr.w r8, .LCPI3_1 @@ -274,22 +268,18 @@ ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload -; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q4, [r0, q0] +; CHECK-NEXT: vldrb.u32 q4, [r0, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q7, [r0, q0] +; CHECK-NEXT: vldrb.u32 q7, [r0, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmul.i32 q6, q7, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q1, [r0, q5] +; CHECK-NEXT: vldrb.u32 q1, [r0, q5] ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vmul.i32 q3, q4, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload @@ -320,14 +310,12 @@ ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload ; CHECK-NEXT: vshr.u32 q1, q1, #16 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrbt.32 q1, [r1, q0] +; CHECK-NEXT: vstrb.32 q1, [r1, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vpstt -; CHECK-NEXT: vstrbt.32 q2, [r1, q0] -; CHECK-NEXT: vstrbt.32 q6, [r1, q5] +; CHECK-NEXT: vstrb.32 q2, [r1, q0] +; CHECK-NEXT: vstrb.32 q6, [r1, q5] ; CHECK-NEXT: adds r1, #12 -; CHECK-NEXT: le lr, .LBB3_2 +; CHECK-NEXT: letp lr, .LBB3_2 ; CHECK-NEXT: .LBB3_3: @ 
%for.cond.cleanup ; CHECK-NEXT: add sp, #216 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -257,13 +257,13 @@ ; CHECK-NEXT: ldr r3, [sp, #64] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mla r7, r11, r3, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB2_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -425,13 +425,13 @@ ; CHECK-NEXT: ldr r3, [sp, #64] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mla r7, r11, r3, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB3_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -735,13 +735,13 @@ ; CHECK-NEXT: ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mla r3, r9, r11, r0 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -907,13 +907,13 @@ ; CHECK-NEXT: 
ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mla r3, r9, r11, r0 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -1120,7 +1120,6 @@ ; CHECK-NEXT: ldr.w r1, [r1, r10, lsl #2] ; CHECK-NEXT: ldrd r6, r7, [r0, #32] ; CHECK-NEXT: ldr.w r3, [r3, r10, lsl #2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: add.w r6, r6, r2, lsl #2 ; CHECK-NEXT: add.w r12, r12, r1, lsl #2 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload @@ -1129,6 +1128,7 @@ ; CHECK-NEXT: add.w r1, r2, r11, lsl #2 ; CHECK-NEXT: add.w r8, r1, r11, lsl #2 ; CHECK-NEXT: add.w r9, r8, r11, lsl #2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_7: @ Parent Loop BB7_3 Depth=1 ; CHECK-NEXT: @ Parent Loop BB7_6 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll @@ -187,8 +187,8 @@ ; CHECK-NEXT: add.w lr, r2, r1, lsr #2 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: eor r2, r1, #-2147483648 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] @@ -480,8 +480,8 @@ ; CHECK-NEXT: add.w lr, r2, r1, lsr #2 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: eor r2, r1, #-2147483648 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 
q1, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -36,8 +36,8 @@ ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 ; CHECK-NEXT: mov.w r10, #-1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrd r4, r5, [r0] @@ -256,10 +256,10 @@ ; CHECK-NEXT: adr r7, .LCPI1_1 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: vldrw.u32 q1, [r7] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: mov.w r3, #-1 ; CHECK-NEXT: mvn r9, #-2147483648 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 @@ -544,8 +544,8 @@ ; CHECK-NEXT: vdup.32 q1, r7 ; CHECK-NEXT: mov.w r12, #-1 ; CHECK-NEXT: mvn r8, #-2147483648 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload @@ -773,8 +773,8 @@ ; CHECK-NEXT: add.w r11, r1, r5, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 ; CHECK-NEXT: add.w r12, r0, r5, lsl #2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: str r5, [sp] @ 4-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrd r4, r9, [r0] @@ -1617,8 +1617,8 @@ ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 
16-byte Reload @@ -2842,7 +2842,6 @@ ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: add.w lr, lr, r12, lsr #4 ; CHECK-NEXT: sub.w r12, r3, #1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI18_2 @@ -2854,6 +2853,7 @@ ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vldrw.u32 q6, [r4] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB18_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload @@ -3142,7 +3142,6 @@ ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: add.w lr, lr, r12, lsr #4 ; CHECK-NEXT: sub.w r12, r3, #1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI19_2 @@ -3154,6 +3153,7 @@ ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vldrw.u32 q6, [r4] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB19_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload diff --git a/llvm/test/CodeGen/X86/basic-block-sections-labels-functions-sections.ll b/llvm/test/CodeGen/X86/basic-block-sections-labels-functions-sections.ll --- a/llvm/test/CodeGen/X86/basic-block-sections-labels-functions-sections.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-labels-functions-sections.ll @@ -5,11 +5,11 @@ define dso_local i32 @_Z3barv() { ret i32 0 } -;; Check we add SHF_LINK_ORDER for .bb_addr_map and link it with the corresponding .text sections. +;; Check we add SHF_LINK_ORDER for __llvm_bb_addr_map and link it with the corresponding .text sections. 
; CHECK: .section .text._Z3barv,"ax",@progbits ; CHECK-LABEL: _Z3barv: ; CHECK-NEXT: [[BAR_BEGIN:.Lfunc_begin[0-9]+]]: -; CHECK: .section .bb_addr_map,"o",@progbits,.text._Z3barv{{$}} +; CHECK: .section __llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3barv{{$}} ; CHECK-NEXT: .quad [[BAR_BEGIN]] @@ -20,16 +20,16 @@ ; CHECK: .section .text._Z3foov,"ax",@progbits ; CHECK-LABEL: _Z3foov: ; CHECK-NEXT: [[FOO_BEGIN:.Lfunc_begin[0-9]+]]: -; CHECK: .section .bb_addr_map,"o",@progbits,.text._Z3foov{{$}} +; CHECK: .section __llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3foov{{$}} ; CHECK-NEXT: .quad [[FOO_BEGIN]] define linkonce_odr dso_local i32 @_Z4fooTIiET_v() comdat { ret i32 0 } -;; Check we add .bb_addr_map section to a COMDAT group with the corresponding .text section if such a COMDAT exists. +;; Check we add __llvm_bb_addr_map section to a COMDAT group with the corresponding .text section if such a COMDAT exists. ; CHECK: .section .text._Z4fooTIiET_v,"axG",@progbits,_Z4fooTIiET_v,comdat ; CHECK-LABEL: _Z4fooTIiET_v: ; CHECK-NEXT: [[FOOCOMDAT_BEGIN:.Lfunc_begin[0-9]+]]: -; CHECK: .section .bb_addr_map,"Go",@progbits,_Z4fooTIiET_v,comdat,.text._Z4fooTIiET_v{{$}} +; CHECK: .section __llvm_bb_addr_map,"Go",@llvm_bb_addr_map,_Z4fooTIiET_v,comdat,.text._Z4fooTIiET_v{{$}} ; CHECK-NEXT: .quad [[FOOCOMDAT_BEGIN]] diff --git a/llvm/test/CodeGen/X86/basic-block-sections-labels.ll b/llvm/test/CodeGen/X86/basic-block-sections-labels.ll --- a/llvm/test/CodeGen/X86/basic-block-sections-labels.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-labels.ll @@ -39,7 +39,7 @@ ; CHECK-LABEL: .LBB_END0_3: ; CHECK-LABEL: .Lfunc_end0: -; CHECK: .section .bb_addr_map,"o",@progbits,.text +; CHECK: .section __llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text ; CHECK-NEXT: .quad .Lfunc_begin0 ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .uleb128 .Lfunc_begin0-.Lfunc_begin0 diff --git a/llvm/test/CodeGen/X86/cmov.ll b/llvm/test/CodeGen/X86/cmov.ll --- a/llvm/test/CodeGen/X86/cmov.ll +++ 
b/llvm/test/CodeGen/X86/cmov.ll @@ -235,3 +235,27 @@ %3 = select i1 %2, i32 %0, i32 -1 ret i32 %3 } + +define i32 @pr47049_3(i32 %0) { +; CHECK-LABEL: pr47049_3: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovgl %edi, %eax +; CHECK-NEXT: retq + %2 = icmp sgt i32 %0, 1 + %3 = select i1 %2, i32 %0, i32 1 + ret i32 %3 +} + +define i32 @pr47049_4(i32 %0) { +; CHECK-LABEL: pr47049_4: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %2 = icmp ugt i32 %0, 1 + %3 = select i1 %2, i32 %0, i32 1 + ret i32 %3 +} diff --git a/llvm/test/CodeGen/X86/gcc_except_table_bb_sections.ll b/llvm/test/CodeGen/X86/gcc_except_table_bb_sections.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/gcc_except_table_bb_sections.ll @@ -0,0 +1,166 @@ +; RUN: llc -basic-block-sections=all -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NON-PIC +; RUN: llc -basic-block-sections=all -mtriple x86_64-pc-linux-gnu -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-PIC +@_ZTIi = external constant i8* + +define i32 @main() uwtable optsize ssp personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; Verify that each basic block section gets its own LSDA exception symbol. +; +; CHECK-LABEL: main: +; CHECK-NEXT: .Lfunc_begin0: +; CHECK-NEXT: .cfi_startproc + +;; Verify personality function and LSDA encoding for NON-PIC mode. +; PersonalityEncoding = dwarf::DW_EH_PE_udata4 +; CHECK-NON-PIC-NEXT: .cfi_personality 3, __gxx_personality_v0 +; LSDAEncoding = dwarf::DW_EH_PE_udata4 +; CHECK-NON-PIC-NEXT: .cfi_lsda 3, .Lexception0 + +;; Verify personality function and LSDA encoding for PIC mode. 
+; PersonalityEncoding = DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4 +; CHECK-PIC-NEXT: .cfi_personality 155, DW.ref.__gxx_personality_v0 +; LSDAEncoding = DW_EH_PE_pcrel | DW_EH_PE_sdata4 +; CHECK-PIC-NEXT: .cfi_lsda 27, .Lexception0 + +; CHECK-LABEL: .Ltmp0: +; CHECK-NEXT: callq _Z1fv +; CHECK-LABEL: .Ltmp1: + +; CHECK-NOT: .cfi_lsda + +; CHECK-LABEL: main.1: +; CHECK-NEXT: .cfi_startproc + +; CHECK-NON-PIC-NEXT: .cfi_personality 3, __gxx_personality_v0 +; CHECK-NON-PIC-NEXT: .cfi_lsda 3, .Lexception1 + +; CHECK-PIC-NEXT: .cfi_personality 155, DW.ref.__gxx_personality_v0 +; CHECK-PIC-NEXT: .cfi_lsda 27, .Lexception1 + +; CHECK-NOT: .cfi_lsda + +; CHECK-LABEL: main.2: +; CHECK-NEXT: .cfi_startproc + +; CHECK-NON-PIC-NEXT: .cfi_personality 3, __gxx_personality_v0 +; CHECK-NON-PIC-NEXT: .cfi_lsda 3, .Lexception2 + +; CHECK-PIC-NEXT: .cfi_personality 155, DW.ref.__gxx_personality_v0 +; CHECK-PIC-NEXT: .cfi_lsda 27, .Lexception2 + +; CHECK: nop +; CHECK-LABEL: .Ltmp2: +; CHECK-LABEL: .LBB_END0_2: + +; CHECK-NOT: .cfi_lsda + +entry: + invoke void @_Z1fv() optsize + to label %try.cont unwind label %lpad + +lpad: + %0 = landingpad { i8*, i32 } + cleanup + catch i8* bitcast (i8** @_ZTIi to i8*) + br label %eh.resume + +try.cont: + ret i32 0 + +eh.resume: + resume { i8*, i32 } %0 +} + +declare void @_Z1fv() optsize + +declare i32 @__gxx_personality_v0(...) +;; Verify that the exception table gets split across the three basic block sections. +; +; CHECK: .section .gcc_except_table +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: GCC_except_table0: +; CHECK-NEXT: .Lexception0: + +;; Verify @LPStart encoding for NON-PIC mode. +; CHECK-NON-PIC-NEXT: .byte 0 # @LPStart Encoding = absptr +; CHECK-NON-PIC-NEXT: .quad main.2 + +;; Verify @LPStart encoding for PIC mode. +; CHECK-PIC-NEXT: .byte 16 # @LPStart Encoding = pcrel +; CHECK-PIC-NEXT: [[DOT:\.Ltmp[0-9]+]]: +; CHECK-PIC-NEXT: .quad main.2-[[DOT]] + +;; Verify @TType encoding for NON-PIC mode. 
+; CHECK-NON-PIC-NEXT: .byte 3 # @TType Encoding = udata4 + +;; Verify @TType encoding for PIC mode. +; CHECK-PIC-NEXT: .byte 156 # @TType Encoding = indirect pcrel sdata8 + +; CHECK-NEXT: .uleb128 .Lttbase0-.Lttbaseref0 +; CHECK-NEXT: .Lttbaseref0: +; CHECK-NEXT: .byte 1 # Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Laction_table_base0-.Lcst_begin0 +; CHECK-NEXT: .Lcst_begin0: +; CHECK-NEXT: .uleb128 .Ltmp0-.Lfunc_begin0 # >> Call Site 1 << +; CHECK-NEXT: .uleb128 .Ltmp1-.Ltmp0 # Call between .Ltmp0 and .Ltmp1 +; CHECK-NEXT: .uleb128 .Ltmp2-main.2 # jumps to .Ltmp2 +; CHECK-NEXT: .byte 3 # On action: 2 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .Lexception1: + +; CHECK-NON-PIC-NEXT: .byte 0 # @LPStart Encoding = absptr +; CHECK-NON-PIC-NEXT: .quad main.2 + +; CHECK-PIC-NEXT: .byte 16 # @LPStart Encoding = pcrel +; CHECK-PIC-NEXT: [[DOT:\.Ltmp[0-9]+]]: +; CHECK-PIC-NEXT: .quad main.2-[[DOT]] + +; CHECK-NON-PIC-NEXT: .byte 3 # @TType Encoding = udata4 + +; CHECK-PIC-NEXT: .byte 156 # @TType Encoding = indirect pcrel sdata8 + +; CHECK-NEXT: .uleb128 .Lttbase0-.Lttbaseref1 +; CHECK-NEXT: .Lttbaseref1: +; CHECK-NEXT: .byte 1 # Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Laction_table_base0-.Lcst_begin1 +; CHECK-NEXT: .Lcst_begin1: +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .Lexception2: + +; CHECK-NON-PIC-NEXT: .byte 0 # @LPStart Encoding = absptr +; CHECK-NON-PIC-NEXT: .quad main.2 + +; CHECK-PIC-NEXT: .byte 16 # @LPStart Encoding = pcrel +; CHECK-PIC-NEXT: [[DOT:\.Ltmp[0-9]+]]: +; CHECK-PIC-NEXT: .quad main.2-[[DOT]] + +; CHECK-NON-PIC-NEXT: .byte 3 # @TType Encoding = udata4 + +; CHECK-PIC-NEXT: .byte 156 # @TType Encoding = indirect pcrel sdata8 + +; CHECK-NEXT: .uleb128 .Lttbase0-.Lttbaseref2 +; CHECK-NEXT: .Lttbaseref2: +; CHECK-NEXT: .byte 1 # Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Laction_table_base0-.Lcst_begin2 +; CHECK-NEXT: .Lcst_begin2: +; CHECK-NEXT: .uleb128 main.2-main.2 # >> Call Site 2 << +; CHECK-NEXT: .uleb128 
.LBB_END0_2-main.2 # Call between main.2 and .LBB_END0_2 +; CHECK-NEXT: .byte 0 # has no landing pad +; CHECK-NEXT: .byte 0 # On action: cleanup +; CHECK-NEXT: .Laction_table_base0: +; CHECK-NEXT: .byte 0 # >> Action Record 1 << +; CHECK-NEXT: # Cleanup +; CHECK-NEXT: .byte 0 # No further actions +; CHECK-NEXT: .byte 1 # >> Action Record 2 << +; CHECK-NEXT: # Catch TypeInfo 1 +; CHECK-NEXT: .byte 125 # Continue to action 1 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: # >> Catch TypeInfos << + +; CHECK-NON-PIC-NEXT: .long _ZTIi # TypeInfo 1 + +; CHECK-PIC-NEXT: [[DOT:\.Ltmp[0-9]+]]: +; CHECK-PIC-NEXT: .quad .L_ZTIi.DW.stub-[[DOT]] + +; CHECK-NEXT: .Lttbase0: +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: # -- End function diff --git a/llvm/test/CodeGen/X86/gcc_except_table_bb_sections_ehpad_groups_with_cold.ll b/llvm/test/CodeGen/X86/gcc_except_table_bb_sections_ehpad_groups_with_cold.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/gcc_except_table_bb_sections_ehpad_groups_with_cold.ll @@ -0,0 +1,96 @@ +; Check that when all exception handling blocks are cold, they get grouped with the cold bbs. +; RUN: echo '!main' > %t +; RUN: echo '!!0' >> %t +; RUN: llc -function-sections -basic-block-sections=%t -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s +@_ZTIi = external constant i8* + +define i32 @main() uwtable optsize ssp personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; Verify that each basic block section gets its own LSDA exception symbol. 
+; +; CHECK-LABEL: main: +; CHECK-NEXT: .Lfunc_begin0: +; CHECK-NEXT: .cfi_startproc +; PersonalityEncoding = dwarf::DW_EH_PE_udata4 +; CHECK-NEXT: .cfi_personality 3, __gxx_personality_v0 +; LSDAEncoding = dwarf::DW_EH_PE_udata4 +; CHECK-NEXT: .cfi_lsda 3, .Lexception0 +; CHECK-LABEL: .Ltmp0: +; CHECK-LABEL: .Ltmp1: + +; CHECK-NOT: .cfi_lsda + +; CHECK-LABEL: main.cold: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 3, __gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 3, .Lexception1 +; CHECK-LABEL: .Ltmp2: +; CHECK-LABEL: .LBB_END0_2: + +; CHECK-NOT: .cfi_lsda + +entry: + invoke void @_Z1fv() optsize + to label %try.cont unwind label %lpad + +lpad: + %0 = landingpad { i8*, i32 } + cleanup + catch i8* bitcast (i8** @_ZTIi to i8*) + br label %eh.resume + +try.cont: + ret i32 0 + +eh.resume: + resume { i8*, i32 } %0 +} + +declare void @_Z1fv() optsize + +declare i32 @__gxx_personality_v0(...) + +; Verify that the exception table gets split across the two basic block sections. +; +; CHECK: .section .gcc_except_table +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: GCC_except_table0: +; CHECK-NEXT: .Lexception0: +; CHECK-NEXT: .byte 0 # @LPStart Encoding = absptr +; CHECK-NEXT: .quad main.cold +; CHECK-NEXT: .byte 3 # @TType Encoding = udata4 +; CHECK-NEXT: .uleb128 .Lttbase0-.Lttbaseref0 +; CHECK-NEXT: .Lttbaseref0: +; CHECK-NEXT: .byte 1 # Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Laction_table_base0-.Lcst_begin0 +; CHECK-NEXT: .Lcst_begin0: +; CHECK-NEXT: .uleb128 .Ltmp0-.Lfunc_begin0 # >> Call Site 1 << +; CHECK-NEXT: .uleb128 .Ltmp1-.Ltmp0 # Call between .Ltmp0 and .Ltmp1 +; CHECK-NEXT: .uleb128 .Ltmp2-main.cold # jumps to .Ltmp2 +; CHECK-NEXT: .byte 3 # On action: 2 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .Lexception1: +; CHECK-NEXT: .byte 0 # @LPStart Encoding = absptr +; CHECK-NEXT: .quad main.cold +; CHECK-NEXT: .byte 3 # @TType Encoding = udata4 +; CHECK-NEXT: .uleb128 .Lttbase0-.Lttbaseref1 +; CHECK-NEXT: .Lttbaseref1: +; CHECK-NEXT: .byte 1 
# Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Laction_table_base0-.Lcst_begin1 +; CHECK-NEXT: .Lcst_begin1: +; CHECK-NEXT: .uleb128 main.cold-main.cold # >> Call Site 2 << +; CHECK-NEXT: .uleb128 .LBB_END0_2-main.cold # Call between main.cold and .LBB_END0_2 +; CHECK-NEXT: .byte 0 # has no landing pad +; CHECK-NEXT: .byte 0 # On action: cleanup +; CHECK-NEXT: .Laction_table_base0: +; CHECK-NEXT: .byte 0 # >> Action Record 1 << +; CHECK-NEXT: # Cleanup +; CHECK-NEXT: .byte 0 # No further actions +; CHECK-NEXT: .byte 1 # >> Action Record 2 << +; CHECK-NEXT: # Catch TypeInfo 1 +; CHECK-NEXT: .byte 125 # Continue to action 1 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: # >> Catch TypeInfos << +; CHECK-NEXT: .long _ZTIi # TypeInfo 1 +; CHECK-NEXT: .Lttbase0: +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: # -- End function diff --git a/llvm/test/MC/AsmParser/llvm_section_types.s b/llvm/test/MC/AsmParser/llvm_section_types.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/AsmParser/llvm_section_types.s @@ -0,0 +1,27 @@ +## Verify that LLVM-specific section types are correctly inferred from assembly input. 
+# RUN: llvm-mc -triple i386-pc-linux-gnu -filetype=obj -o %t %s +# RUN: llvm-readobj -S - < %t | FileCheck %s +.section .section1,"",@llvm_bb_addr_map +.byte 1 +.section .section2,"",@llvm_call_graph_profile +.byte 1 +.section .section3,"",@llvm_odrtab +.byte 1 +.section .section4,"",@llvm_linker_options +.byte 1 +.section .section5,"",@llvm_sympart +.byte 1 +.section .section6,"",@llvm_dependent_libraries +.byte 1 +# CHECK: Name: .section1 +# CHECK-NEXT: Type: SHT_LLVM_BB_ADDR_MAP +# CHECK: Name: .section2 +# CHECK-NEXT: Type: SHT_LLVM_CALL_GRAPH_PROFILE +# CHECK: Name: .section3 +# CHECK-NEXT: Type: SHT_LLVM_ODRTAB +# CHECK: Name: .section4 +# CHECK-NEXT: Type: SHT_LLVM_LINKER_OPTIONS +# CHECK: Name: .section5 +# CHECK-NEXT: Type: SHT_LLVM_SYMPART +# CHECK: Name: .section6 +# CHECK-NEXT: Type: SHT_LLVM_DEPENDENT_LIBRARIES diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt --- a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt @@ -34,6 +34,180 @@ # CHECK: xxsetaccz 0 0x7c 0x03 0x01 0x62 +# CHECK: pmxvf16ger2 0, 1, 2, 4, 4, 2 +0x07 0x90 0x80 0x44 0xec 0x01 0x10 0x98 + +# CHECK: pmxvf16ger2pp 0, 1, 2, 4, 4, 2 +0x07 0x90 0x80 0x44 0xec 0x01 0x10 0x90 + +# CHECK: pmxvf16ger2pn 0, 1, 2, 4, 4, 2 +0x07 0x90 0x80 0x44 0xec 0x01 0x14 0x90 + +# CHECK: pmxvf16ger2np 0, 1, 2, 4, 4, 2 +0x07 0x90 0x80 0x44 0xec 0x01 0x12 0x90 + +# CHECK: pmxvf16ger2nn 0, 1, 2, 4, 4, 2 +0x07 0x90 0x80 0x44 0xec 0x01 0x16 0x90 + +# CHECK: pmxvf32ger 0, 1, 2, 4, 4 +0x07 0x90 0x00 0x44 0xec 0x01 0x10 0xd8 + +# CHECK: pmxvf32gerpp 0, 1, 2, 4, 4 +0x07 0x90 0x00 0x44 0xec 0x01 0x10 0xd0 + +# CHECK: pmxvf32gerpn 0, 1, 2, 4, 4 +0x07 0x90 0x00 0x44 0xec 0x01 0x14 0xd0 + +# CHECK: pmxvf32gernp 0, 1, 2, 4, 4 +0x07 0x90 0x00 0x44 0xec 0x01 0x12 0xd0 + +# CHECK: pmxvf32gernn 0, 1, 2, 4, 4 +0x07 0x90 0x00 0x44 0xec 0x01 0x16 0xd0 + +# CHECK: pmxvf64ger 
0, 0, 2, 4, 3 +0x07 0x90 0x00 0x4c 0xec 0x00 0x11 0xd8 + +# CHECK: pmxvf64gerpp 0, 2, 2, 4, 3 +0x07 0x90 0x00 0x4c 0xec 0x02 0x11 0xd0 + +# CHECK: pmxvf64gerpn 0, 4, 2, 4, 3 +0x07 0x90 0x00 0x4c 0xec 0x04 0x15 0xd0 + +# CHECK: pmxvf64gernp 0, 62, 2, 4, 3 +0x07 0x90 0x00 0x4c 0xec 0x1e 0x13 0xd4 + +# CHECK: pmxvf64gernn 0, 30, 2, 4, 3 +0x07 0x90 0x00 0x4c 0xec 0x1e 0x17 0xd0 + +# CHECK: pmxvi4ger8 0, 1, 2, 4, 4, 4 +0x07 0x90 0x04 0x44 0xec 0x01 0x11 0x18 + +# CHECK: pmxvi4ger8pp 0, 1, 2, 4, 4, 4 +0x07 0x90 0x04 0x44 0xec 0x01 0x11 0x10 + +# CHECK: pmxvi8ger4 0, 1, 2, 4, 4, 4 +0x07 0x90 0x40 0x44 0xec 0x01 0x10 0x18 + +# CHECK: pmxvi8ger4pp 0, 1, 2, 4, 4, 4 +0x07 0x90 0x40 0x44 0xec 0x01 0x10 0x10 + +# CHECK: pmxvi16ger2s 0, 1, 2, 4, 4, 2 +0x07 0x90 0x80 0x44 0xec 0x01 0x11 0x58 + +# CHECK: pmxvi16ger2spp 0, 1, 2, 4, 4, 2 +0x07 0x90 0x80 0x44 0xec 0x01 0x11 0x50 + +# CHECK: xvf16ger2 0, 1, 2 +0xec 0x01 0x10 0x98 + +# CHECK: xvf16ger2pp 0, 1, 2 +0xec 0x01 0x10 0x90 + +# CHECK: xvf16ger2pn 0, 1, 2 +0xec 0x01 0x14 0x90 + +# CHECK: xvf16ger2np 0, 1, 2 +0xec 0x01 0x12 0x90 + +# CHECK: xvf16ger2nn 0, 1, 2 +0xec 0x01 0x16 0x90 + +# CHECK: xvf32ger 0, 1, 2 +0xec 0x01 0x10 0xd8 + +# CHECK: xvf32gerpp 0, 1, 2 +0xec 0x01 0x10 0xd0 + +# CHECK: xvf32gerpn 0, 1, 2 +0xec 0x01 0x14 0xd0 + +# CHECK: xvf32gernp 0, 1, 2 +0xec 0x01 0x12 0xd0 + +# CHECK: xvf32gernn 0, 1, 2 +0xec 0x01 0x16 0xd0 + +# CHECK: xvf64ger 0, 0, 2 +0xec 0x00 0x11 0xd8 + +# CHECK: xvf64gerpp 0, 2, 2 +0xec 0x02 0x11 0xd0 + +# CHECK: xvf64gerpn 0, 62, 2 +0xec 0x1e 0x15 0xd4 + +# CHECK: xvf64gernp 0, 0, 2 +0xec 0x00 0x13 0xd0 + +# CHECK: xvf64gernn 0, 0, 2 +0xec 0x00 0x17 0xd0 + +# CHECK: xvi4ger8 0, 1, 2 +0xec 0x01 0x11 0x18 + +# CHECK: xvi4ger8pp 0, 1, 2 +0xec 0x01 0x11 0x10 + +# CHECK: xvi8ger4 0, 1, 2 +0xec 0x01 0x10 0x18 + +# CHECK: xvi8ger4pp 0, 1, 2 +0xec 0x01 0x10 0x10 + +# CHECK: xvi16ger2s 0, 1, 2 +0xec 0x01 0x11 0x58 + +# CHECK: xvi16ger2spp 0, 1, 2 +0xec 0x01 0x11 0x50 + +# CHECK: xvbf16ger2 2, 33, 34 
+0xed 0x01 0x11 0x9e + +# CHECK: xvbf16ger2pp 1, 33, 34 +0xec 0x81 0x11 0x96 + +# CHECK: xvbf16ger2pn 2, 33, 34 +0xed 0x01 0x15 0x96 + +# CHECK: xvbf16ger2np 1, 33, 34 +0xec 0x81 0x13 0x96 + +# CHECK: xvbf16ger2nn 2, 33, 34 +0xed 0x01 0x17 0x96 + +# CHECK: pmxvbf16ger2 2, 33, 34, 4, 4, 2 +0x07 0x90 0x80 0x44 0xed 0x01 0x11 0x9e + +# CHECK: pmxvbf16ger2pp 1, 33, 34, 4, 4, 2 +0x07 0x90 0x80 0x44 0xec 0x81 0x11 0x96 + +# CHECK: pmxvbf16ger2pn 2, 33, 34, 4, 4, 2 +0x07 0x90 0x80 0x44 0xed 0x01 0x15 0x96 + +# CHECK: pmxvbf16ger2np 1, 33, 34, 4, 4, 2 +0x07 0x90 0x80 0x44 0xec 0x81 0x13 0x96 + +# CHECK: pmxvbf16ger2nn 2, 33, 34, 4, 4, 2 +0x07 0x90 0x80 0x44 0xed 0x01 0x17 0x96 + +# CHECK: xvi8ger4spp 1, 33, 34 +0xec 0x81 0x13 0x1e + +# CHECK: xvi16ger2 1, 33, 34 +0xec 0x81 0x12 0x5e + +# CHECK: xvi16ger2pp 1, 33, 34 +0xec 0x81 0x13 0x5e + +# CHECK: pmxvi8ger4spp 1, 33, 34, 4, 4, 8 +0x07 0x90 0x80 0x44 0xec 0x81 0x13 0x1e + +# CHECK: pmxvi16ger2 1, 33, 34, 4, 4, 2 +0x07 0x90 0x80 0x44 0xec 0x81 0x12 0x5e + +# CHECK: pmxvi16ger2pp 1, 33, 34, 4, 4, 2 +0x07 0x90 0x80 0x44 0xec 0x81 0x13 0x5e + # CHECK: lxvp 2, 32(4) 0x18 0x44 0x00 0x20 diff --git a/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s b/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s --- a/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s +++ b/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s @@ -32,6 +32,238 @@ # CHECK-BE: xxsetaccz 1 # encoding: [0x7c,0x83,0x01,0x62] # CHECK-LE: xxsetaccz 1 # encoding: [0x62,0x01,0x83,0x7c] xxsetaccz 1 +# CHECK-BE: pmxvf16ger2 0, 1, 2, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x10,0x98] +# CHECK-LE: pmxvf16ger2 0, 1, 2, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x98,0x10,0x01,0xec] + pmxvf16ger2 0, 1, 2, 4, 4, 2 +# CHECK-BE: pmxvf16ger2pp 0, 1, 2, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x10,0x90] +# CHECK-LE: pmxvf16ger2pp 0, 1, 2, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x90,0x10,0x01,0xec + 
pmxvf16ger2pp 0, 1, 2, 4, 4, 2 +# CHECK-BE: pmxvf16ger2pn 0, 1, 2, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x14,0x90] +# CHECK-LE: pmxvf16ger2pn 0, 1, 2, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x90,0x14,0x01,0xec] + pmxvf16ger2pn 0, 1, 2, 4, 4, 2 +# CHECK-BE: pmxvf16ger2np 0, 1, 2, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x12,0x90] +# CHECK-LE: pmxvf16ger2np 0, 1, 2, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x90,0x12,0x01,0xec] + pmxvf16ger2np 0, 1, 2, 4, 4, 2 +# CHECK-BE: pmxvf16ger2nn 0, 1, 2, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x16,0x90] +# CHECK-LE: pmxvf16ger2nn 0, 1, 2, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x90,0x16,0x01,0xec] + pmxvf16ger2nn 0, 1, 2, 4, 4, 2 +# CHECK-BE: pmxvf32ger 0, 1, 2, 4, 4 # encoding: [0x07,0x90,0x00,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x10,0xd8] +# CHECK-LE: pmxvf32ger 0, 1, 2, 4, 4 # encoding: [0x44,0x00,0x90,0x07, +# CHECK-LE-SAME: 0xd8,0x10,0x01,0xec] + pmxvf32ger 0, 1, 2, 4, 4 +# CHECK-BE: pmxvf32gerpp 0, 1, 2, 4, 4 # encoding: [0x07,0x90,0x00,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x10,0xd0] +# CHECK-LE: pmxvf32gerpp 0, 1, 2, 4, 4 # encoding: [0x44,0x00,0x90,0x07, +# CHECK-LE-SAME: 0xd0,0x10,0x01,0xec] + pmxvf32gerpp 0, 1, 2, 4, 4 +# CHECK-BE: pmxvf32gerpn 0, 1, 2, 4, 4 # encoding: [0x07,0x90,0x00,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x14,0xd0] +# CHECK-LE: pmxvf32gerpn 0, 1, 2, 4, 4 # encoding: [0x44,0x00,0x90,0x07, +# CHECK-LE-SAME: 0xd0,0x14,0x01,0xec] + pmxvf32gerpn 0, 1, 2, 4, 4 +# CHECK-BE: pmxvf32gernp 0, 1, 2, 4, 4 # encoding: [0x07,0x90,0x00,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x12,0xd0] +# CHECK-LE: pmxvf32gernp 0, 1, 2, 4, 4 # encoding: [0x44,0x00,0x90,0x07, +# CHECK-LE-SAME: 0xd0,0x12,0x01,0xec] + pmxvf32gernp 0, 1, 2, 4, 4 +# CHECK-BE: pmxvf32gernn 0, 1, 2, 4, 4 # encoding: [0x07,0x90,0x00,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x16,0xd0] +# CHECK-LE: pmxvf32gernn 0, 1, 2, 4, 4 
# encoding: [0x44,0x00,0x90,0x07, +# CHECK-LE-SAME: 0xd0,0x16,0x01,0xec] + pmxvf32gernn 0, 1, 2, 4, 4 +# CHECK-BE: pmxvf64ger 0, 0, 2, 4, 3 # encoding: [0x07,0x90,0x00,0x4c, +# CHECK-BE-SAME: 0xec,0x00,0x11,0xd8] +# CHECK-LE: pmxvf64ger 0, 0, 2, 4, 3 # encoding: [0x4c,0x00,0x90,0x07, +# CHECK-LE-SAME: 0xd8,0x11,0x00,0xec] + pmxvf64ger 0, 0, 2, 4, 3 +# CHECK-BE: pmxvf64gerpp 0, 2, 2, 4, 3 # encoding: [0x07,0x90,0x00,0x4c, +# CHECK-BE-SAME: 0xec,0x02,0x11,0xd0] +# CHECK-LE: pmxvf64gerpp 0, 2, 2, 4, 3 # encoding: [0x4c,0x00,0x90,0x07, +# CHECK-LE-SAME: 0xd0,0x11,0x02,0xec] + pmxvf64gerpp 0, 2, 2, 4, 3 +# CHECK-BE: pmxvf64gerpn 0, 4, 2, 4, 3 # encoding: [0x07,0x90,0x00,0x4c, +# CHECK-BE-SAME: 0xec,0x04,0x15,0xd0] +# CHECK-LE: pmxvf64gerpn 0, 4, 2, 4, 3 # encoding: [0x4c,0x00,0x90,0x07, +# CHECK-LE-SAME: 0xd0,0x15,0x04,0xec] + pmxvf64gerpn 0, 4, 2, 4, 3 +# CHECK-BE: pmxvf64gernp 0, 32, 2, 4, 3 # encoding: [0x07,0x90,0x00,0x4c, +# CHECK-BE-SAME: 0xec,0x00,0x13,0xd4] +# CHECK-LE: pmxvf64gernp 0, 32, 2, 4, 3 # encoding: [0x4c,0x00,0x90,0x07, +# CHECK-LE-SAME: 0xd4,0x13,0x00,0xec] + pmxvf64gernp 0, 32, 2, 4, 3 +# CHECK-BE: pmxvf64gernn 0, 62, 2, 4, 3 # encoding: [0x07,0x90,0x00,0x4c, +# CHECK-BE-SAME: 0xec,0x1e,0x17,0xd4] +# CHECK-LE: pmxvf64gernn 0, 62, 2, 4, 3 # encoding: [0x4c,0x00,0x90,0x07, +# CHECK-LE-SAME: 0xd4,0x17,0x1e,0xec] + pmxvf64gernn 0, 62, 2, 4, 3 +# CHECK-BE: pmxvi4ger8 0, 1, 2, 4, 4, 4 # encoding: [0x07,0x90,0x04,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x11,0x18] +# CHECK-LE: pmxvi4ger8 0, 1, 2, 4, 4, 4 # encoding: [0x44,0x04,0x90,0x07 +# CHECK-LE-SAME: 0x18,0x11,0x01,0xec] + pmxvi4ger8 0, 1, 2, 4, 4, 4 +# CHECK-BE: pmxvi4ger8pp 0, 1, 2, 4, 4, 4 # encoding: [0x07,0x90,0x04,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x11,0x10] +# CHECK-LE: pmxvi4ger8pp 0, 1, 2, 4, 4, 4 # encoding: [0x44,0x04,0x90,0x07 +# CHECK-LE-SAME: 0x10,0x11,0x01,0xec] + pmxvi4ger8pp 0, 1, 2, 4, 4, 4 +# CHECK-BE: pmxvi8ger4 0, 1, 2, 4, 4, 4 # encoding: [0x07,0x90,0x40,0x44, +# CHECK-BE-SAME: 
0xec,0x01,0x10,0x18] +# CHECK-LE: pmxvi8ger4 0, 1, 2, 4, 4, 4 # encoding: [0x44,0x40,0x90,0x07, +# CHECK-LE-SAME: 0x18,0x10,0x01,0xec] + pmxvi8ger4 0, 1, 2, 4, 4, 4 +# CHECK-BE: pmxvi8ger4pp 0, 1, 2, 4, 4, 4 # encoding: [0x07,0x90,0x40,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x10,0x10] +# CHECK-LE: pmxvi8ger4pp 0, 1, 2, 4, 4, 4 # encoding: [0x44,0x40,0x90,0x07, +# CHECK-LE-SAME: 0x10,0x10,0x01,0xec] + pmxvi8ger4pp 0, 1, 2, 4, 4, 4 +# CHECK-BE: pmxvi16ger2s 0, 1, 2, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x11,0x58] +# CHECK-LE: pmxvi16ger2s 0, 1, 2, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x58,0x11,0x01,0xec] + pmxvi16ger2s 0, 1, 2, 4, 4, 2 +# CHECK-BE: pmxvi16ger2spp 0, 1, 2, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x01,0x11,0x50] +# CHECK-LE: pmxvi16ger2spp 0, 1, 2, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x50,0x11,0x01,0xec] + pmxvi16ger2spp 0, 1, 2, 4, 4, 2 +# CHECK-BE: xvf16ger2 0, 1, 2 # encoding: [0xec,0x01,0x10,0x98] +# CHECK-LE: xvf16ger2 0, 1, 2 # encoding: [0x98,0x10,0x01,0xec] + xvf16ger2 0, 1, 2 +# CHECK-BE: xvf16ger2pp 0, 1, 2 # encoding: [0xec,0x01,0x10,0x90] +# CHECK-LE: xvf16ger2pp 0, 1, 2 # encoding: [0x90,0x10,0x01,0xec] + xvf16ger2pp 0, 1, 2 +# CHECK-BE: xvf16ger2pn 0, 1, 2 # encoding: [0xec,0x01,0x14,0x90] +# CHECK-LE: xvf16ger2pn 0, 1, 2 # encoding: [0x90,0x14,0x01,0xec] + xvf16ger2pn 0, 1, 2 +# CHECK-BE: xvf16ger2np 0, 1, 2 # encoding: [0xec,0x01,0x12,0x90] +# CHECK-LE: xvf16ger2np 0, 1, 2 # encoding: [0x90,0x12,0x01,0xec] + xvf16ger2np 0, 1, 2 +# CHECK-BE: xvf16ger2nn 0, 1, 2 # encoding: [0xec,0x01,0x16,0x90] +# CHECK-LE: xvf16ger2nn 0, 1, 2 # encoding: [0x90,0x16,0x01,0xec] + xvf16ger2nn 0, 1, 2 +# CHECK-BE: xvf32ger 0, 1, 2 # encoding: [0xec,0x01,0x10,0xd8] +# CHECK-LE: xvf32ger 0, 1, 2 # encoding: [0xd8,0x10,0x01,0xec] + xvf32ger 0, 1, 2 +# CHECK-BE: xvf32gerpp 0, 1, 2 # encoding: [0xec,0x01,0x10,0xd0] +# CHECK-LE: xvf32gerpp 0, 1, 2 # encoding: 
[0xd0,0x10,0x01,0xec] + xvf32gerpp 0, 1, 2 +# CHECK-BE: xvf32gerpn 0, 1, 2 # encoding: [0xec,0x01,0x14,0xd0] +# CHECK-LE: xvf32gerpn 0, 1, 2 # encoding: [0xd0,0x14,0x01,0xec] + xvf32gerpn 0, 1, 2 +# CHECK-BE: xvf32gernp 0, 1, 2 # encoding: [0xec,0x01,0x12,0xd0] +# CHECK-LE: xvf32gernp 0, 1, 2 # encoding: [0xd0,0x12,0x01,0xec] + xvf32gernp 0, 1, 2 +# CHECK-BE: xvf32gernn 0, 1, 2 # encoding: [0xec,0x01,0x16,0xd0] +# CHECK-LE: xvf32gernn 0, 1, 2 # encoding: [0xd0,0x16,0x01,0xec] + xvf32gernn 0, 1, 2 +# CHECK-BE: xvf64ger 0, 2, 2 # encoding: [0xec,0x02,0x11,0xd8] +# CHECK-LE: xvf64ger 0, 2, 2 # encoding: [0xd8,0x11,0x02,0xec] + xvf64ger 0, 2, 2 +# CHECK-BE: xvf64gerpp 0, 0, 2 # encoding: [0xec,0x00,0x11,0xd0] +# CHECK-LE: xvf64gerpp 0, 0, 2 # encoding: [0xd0,0x11,0x00,0xec] + xvf64gerpp 0, 0, 2 +# CHECK-BE: xvf64gerpn 0, 4, 2 # encoding: [0xec,0x04,0x15,0xd0] +# CHECK-LE: xvf64gerpn 0, 4, 2 # encoding: [0xd0,0x15,0x04,0xec] + xvf64gerpn 0, 4, 2 +# CHECK-BE: xvf64gernp 0, 62, 2 # encoding: [0xec,0x1e,0x13,0xd4] +# CHECK-LE: xvf64gernp 0, 62, 2 # encoding: [0xd4,0x13,0x1e,0xec] + xvf64gernp 0, 62, 2 +# CHECK-BE: xvf64gernn 0, 0, 2 # encoding: [0xec,0x00,0x17,0xd0] +# CHECK-LE: xvf64gernn 0, 0, 2 # encoding: [0xd0,0x17,0x00,0xec] + xvf64gernn 0, 0, 2 +# CHECK-BE: xvi4ger8 0, 1, 2 # encoding: [0xec,0x01,0x11,0x18] +# CHECK-LE: xvi4ger8 0, 1, 2 # encoding: [0x18,0x11,0x01,0xec] + xvi4ger8 0, 1, 2 +# CHECK-BE: xvi4ger8pp 0, 1, 2 # encoding: [0xec,0x01,0x11,0x10] +# CHECK-LE: xvi4ger8pp 0, 1, 2 # encoding: [0x10,0x11,0x01,0xec] + xvi4ger8pp 0, 1, 2 +# CHECK-BE: xvi8ger4 0, 1, 2 # encoding: [0xec,0x01,0x10,0x18] +# CHECK-LE: xvi8ger4 0, 1, 2 # encoding: [0x18,0x10,0x01,0xec] + xvi8ger4 0, 1, 2 +# CHECK-BE: xvi8ger4pp 0, 1, 2 # encoding: [0xec,0x01,0x10,0x10] +# CHECK-LE: xvi8ger4pp 0, 1, 2 # encoding: [0x10,0x10,0x01,0xec] + xvi8ger4pp 0, 1, 2 +# CHECK-BE: xvi16ger2s 0, 1, 2 # encoding: [0xec,0x01,0x11,0x58] +# CHECK-LE: xvi16ger2s 0, 1, 2 # encoding: [0x58,0x11,0x01,0xec] + 
xvi16ger2s 0, 1, 2 +# CHECK-BE: xvi16ger2spp 0, 1, 2 # encoding: [0xec,0x01,0x11,0x50] +# CHECK-LE: xvi16ger2spp 0, 1, 2 # encoding: [0x50,0x11,0x01,0xec] + xvi16ger2spp 0, 1, 2 +# CHECK-BE: xvbf16ger2 2, 33, 34 # encoding: [0xed,0x01,0x11,0x9e] +# CHECK-LE: xvbf16ger2 2, 33, 34 # encoding: [0x9e,0x11,0x01,0xed] + xvbf16ger2 2, 33, 34 +# CHECK-BE: xvbf16ger2pp 1, 33, 34 # encoding: [0xec,0x81,0x11,0x96] +# CHECK-LE: xvbf16ger2pp 1, 33, 34 # encoding: [0x96,0x11,0x81,0xec] + xvbf16ger2pp 1, 33, 34 +# CHECK-BE: xvbf16ger2pn 2, 33, 34 # encoding: [0xed,0x01,0x15,0x96] +# CHECK-LE: xvbf16ger2pn 2, 33, 34 # encoding: [0x96,0x15,0x01,0xed] + xvbf16ger2pn 2, 33, 34 +# CHECK-BE: xvbf16ger2np 1, 33, 34 # encoding: [0xec,0x81,0x13,0x96] +# CHECK-LE: xvbf16ger2np 1, 33, 34 # encoding: [0x96,0x13,0x81,0xec] + xvbf16ger2np 1, 33, 34 +# CHECK-BE: xvbf16ger2nn 2, 33, 34 # encoding: [0xed,0x01,0x17,0x96] +# CHECK-LE: xvbf16ger2nn 2, 33, 34 # encoding: [0x96,0x17,0x01,0xed] + xvbf16ger2nn 2, 33, 34 +# CHECK-BE: pmxvbf16ger2 2, 33, 34, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xed,0x01,0x11,0x9e] +# CHECK-LE: pmxvbf16ger2 2, 33, 34, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x9e,0x11,0x01,0xed] + pmxvbf16ger2 2, 33, 34, 4, 4, 2 +# CHECK-BE: pmxvbf16ger2pp 1, 33, 34, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x81,0x11,0x96] +# CHECK-LE: pmxvbf16ger2pp 1, 33, 34, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x96,0x11,0x81,0xec] + pmxvbf16ger2pp 1, 33, 34, 4, 4, 2 +# CHECK-BE: pmxvbf16ger2pn 2, 33, 34, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xed,0x01,0x15,0x96] +# CHECK-LE: pmxvbf16ger2pn 2, 33, 34, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x96,0x15,0x01,0xed] + pmxvbf16ger2pn 2, 33, 34, 4, 4, 2 +# CHECK-BE: pmxvbf16ger2np 1, 33, 34, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x81,0x13,0x96] +# CHECK-LE: pmxvbf16ger2np 1, 33, 34, 4, 4, 2 # encoding: 
[0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x96,0x13,0x81,0xec] + pmxvbf16ger2np 1, 33, 34, 4, 4, 2 +# CHECK-BE: pmxvbf16ger2nn 2, 33, 34, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xed,0x01,0x17,0x96] +# CHECK-LE: pmxvbf16ger2nn 2, 33, 34, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x96,0x17,0x01,0xed] + pmxvbf16ger2nn 2, 33, 34, 4, 4, 2 +# CHECK-BE: xvi8ger4spp 1, 33, 34 # encoding: [0xec,0x81,0x13,0x1e] +# CHECK-LE: xvi8ger4spp 1, 33, 34 # encoding: [0x1e,0x13,0x81,0xec] + xvi8ger4spp 1, 33, 34 +# CHECK-BE: xvi16ger2 1, 33, 34 # encoding: [0xec,0x81,0x12,0x5e] +# CHECK-LE: xvi16ger2 1, 33, 34 # encoding: [0x5e,0x12,0x81,0xec] + xvi16ger2 1, 33, 34 +# CHECK-BE: xvi16ger2pp 1, 33, 34 # encoding: [0xec,0x81,0x13,0x5e] +# CHECK-LE: xvi16ger2pp 1, 33, 34 # encoding: [0x5e,0x13,0x81,0xec] + xvi16ger2pp 1, 33, 34 +# CHECK-BE: pmxvi8ger4spp 1, 33, 34, 4, 4, 8 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x81,0x13,0x1e] +# CHECK-LE: pmxvi8ger4spp 1, 33, 34, 4, 4, 8 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x1e,0x13,0x81,0xec] + pmxvi8ger4spp 1, 33, 34, 4, 4, 8 +# CHECK-BE: pmxvi16ger2 1, 33, 34, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x81,0x12,0x5e] +# CHECK-LE: pmxvi16ger2 1, 33, 34, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x5e,0x12,0x81,0xec] + pmxvi16ger2 1, 33, 34, 4, 4, 2 +# CHECK-BE: pmxvi16ger2pp 1, 33, 34, 4, 4, 2 # encoding: [0x07,0x90,0x80,0x44, +# CHECK-BE-SAME: 0xec,0x81,0x13,0x5e] +# CHECK-LE: pmxvi16ger2pp 1, 33, 34, 4, 4, 2 # encoding: [0x44,0x80,0x90,0x07, +# CHECK-LE-SAME: 0x5e,0x13,0x81,0xec] + pmxvi16ger2pp 1, 33, 34, 4, 4, 2 # CHECK-BE: lxvp 2, 32(4) # encoding: [0x18,0x44,0x00,0x20] # CHECK-LE: lxvp 2, 32(4) # encoding: [0x20,0x00,0x44,0x18] lxvp 2, 32(4) diff --git a/llvm/test/Object/archive-malformed-object.test b/llvm/test/Object/archive-malformed-object.test new file mode 100644 --- /dev/null +++ b/llvm/test/Object/archive-malformed-object.test @@ 
-0,0 +1,38 @@ +## Show that the archive library emits error messages when adding malformed +## objects. + +# RUN: rm -rf %t.dir +# RUN: split-file %s %t.dir +# RUN: cd %t.dir + +## Malformed bitcode object. +# RUN: llvm-as input.ll -o input.bc +# RUN: %python -c "with open('input.bc', 'a') as f: f.truncate(10)" +# RUN: not llvm-ar rc bad.a input.bc 2>&1 | FileCheck %s --check-prefix=ERR1 + +# ERR1: error: bad.a: Invalid bitcode signature + +## Non-bitcode malformed file. +# RUN: yaml2obj input.yaml -o input.o +# RUN: not llvm-ar rc bad.a input.o 2>&1 | FileCheck %s --check-prefix=ERR2 + +# ERR2: error: bad.a: section header table goes past the end of the file: e_shoff = 0x9999 + +## Don't emit an error if the symbol table is not required. +# RUN: llvm-ar rcS good.a input.o input.bc +# RUN: llvm-ar t good.a | FileCheck %s --check-prefix=CONTENTS + +# CONTENTS: input.o +# CONTENTS-NEXT: input.bc + +#--- input.ll +target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux" + +#--- input.yaml +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + EShOff: 0x9999 diff --git a/llvm/test/Object/archive-unknown-filetype.test b/llvm/test/Object/archive-unknown-filetype.test new file mode 100644 --- /dev/null +++ b/llvm/test/Object/archive-unknown-filetype.test @@ -0,0 +1,11 @@ +## Show that the archive library does not emit an error or add any symbols to +## the archive symbol table, when it encounters an unknown file type, but still +## adds the file to the archive. 
+ +# RUN: echo something > %t +# RUN: rm -f %t.a +# RUN: llvm-ar rc %t.a %t +# RUN: llvm-ar t %t.a | FileCheck %s --check-prefix=CONTENTS -DFILE=%basename_t +# RUN: llvm-nm --print-armap %t.a | FileCheck %s --allow-empty --implicit-check-not={{.}} + +# CONTENTS: [[FILE]] diff --git a/llvm/test/Other/change-printer.ll b/llvm/test/Other/change-printer.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Other/change-printer.ll @@ -0,0 +1,128 @@ +; Simple checks of -print-changed functionality +; +; Note that (mostly) only the banners are checked. +; +; Simple functionality check. +; RUN: opt -S -print-changed -passes=instsimplify 2>&1 -o /dev/null < %s | FileCheck %s --check-prefix=CHECK-SIMPLE +; +; Check that only the passes that change the IR are printed and that the +; others (including g) are filtered out. +; RUN: opt -S -print-changed -passes=instsimplify -filter-print-funcs=f 2>&1 -o /dev/null < %s | FileCheck %s --check-prefix=CHECK-FUNC-FILTER +; +; Check that the reporting of IRs respects -print-module-scope +; RUN: opt -S -print-changed -passes=instsimplify -print-module-scope 2>&1 -o /dev/null < %s | FileCheck %s --check-prefix=CHECK-PRINT-MOD-SCOPE +; +; Check that the reporting of IRs respects -print-module-scope +; RUN: opt -S -print-changed -passes=instsimplify -filter-print-funcs=f -print-module-scope 2>&1 -o /dev/null < %s | FileCheck %s --check-prefix=CHECK-FUNC-FILTER-MOD-SCOPE +; +; Check that reporting of multiple functions happens +; RUN: opt -S -print-changed -passes=instsimplify -filter-print-funcs="f,g" 2>&1 -o /dev/null < %s | FileCheck %s --check-prefix=CHECK-FILTER-MULT-FUNC +; +; Check that the reporting of IRs respects -filter-passes +; RUN: opt -S -print-changed -passes="instsimplify,no-op-function" -filter-passes="NoOpFunctionPass" 2>&1 -o /dev/null < %s | FileCheck %s --check-prefix=CHECK-FILTER-PASSES +; +; Check that the reporting of IRs respects -filter-passes with multiple passes +; RUN: opt -S -print-changed 
-passes="instsimplify,no-op-function" -filter-passes="NoOpFunctionPass,InstSimplifyPass" 2>&1 -o /dev/null < %s | FileCheck %s --check-prefix=CHECK-FILTER-MULT-PASSES +; +; Check that the reporting of IRs respects both -filter-passes and -filter-print-funcs +; RUN: opt -S -print-changed -passes="instsimplify,no-op-function" -filter-passes="NoOpFunctionPass,InstSimplifyPass" -filter-print-funcs=f 2>&1 -o /dev/null < %s | FileCheck %s --check-prefix=CHECK-FILTER-FUNC-PASSES +; +; Check that the reporting of IRs respects -filter-passes, -filter-print-funcs and -print-module-scope +; RUN: opt -S -print-changed -passes="instsimplify,no-op-function" -filter-passes="NoOpFunctionPass,InstSimplifyPass" -filter-print-funcs=f -print-module-scope 2>&1 -o /dev/null < %s | FileCheck %s --check-prefix=CHECK-FILTER-FUNC-PASSES-MOD-SCOPE +; +; Check that repeated passes that change the IR are printed and that the +; others (including g) are filtered out. Note that the second time +; instsimplify is run on f, it does not change the IR +; RUN: opt -S -print-changed -passes="instsimplify,instsimplify" -filter-print-funcs=f 2>&1 -o /dev/null < %s | FileCheck %s --check-prefix=CHECK-MULT-PASSES-FILTER-FUNC + +define i32 @g() { +entry: + %a = add i32 2, 3 + ret i32 %a +} + +define i32 @f() { +entry: + %a = add i32 2, 3 + ret i32 %a +} + +; CHECK-SIMPLE: *** IR Dump At Start: *** +; CHECK-SIMPLE-NEXT: ; ModuleID = {{.+}} +; CHECK-SIMPLE: *** IR Dump After VerifierPass (module) omitted because no change *** +; CHECK-SIMPLE: *** IR Dump After InstSimplifyPass *** (function: g) +; CHECK-SIMPLE-NEXT: define i32 @g() +; CHECK-SIMPLE: *** IR Pass PassManager{{.*}} (function: g) ignored *** +; CHECK-SIMPLE: *** IR Dump After InstSimplifyPass *** (function: f) +; CHECK-SIMPLE-NEXT: define i32 @f() +; CHECK-SIMPLE: *** IR Pass PassManager{{.*}} (function: f) ignored *** +; CHECK-SIMPLE: *** IR Pass ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> (module) ignored *** +; CHECK-SIMPLE: *** IR 
Dump After VerifierPass (module) omitted because no change *** +; CHECK-SIMPLE: *** IR Dump After PrintModulePass (module) omitted because no change *** +; CHECK-SIMPLE-NOT: *** IR + +; CHECK-FUNC-FILTER: *** IR Dump At Start: *** +; CHECK-FUNC-FILTER-NEXT: ; ModuleID = {{.+}} +; CHECK-FUNC-FILTER: *** IR Dump After InstSimplifyPass (function: g) filtered out *** +; CHECK-FUNC-FILTER: *** IR Dump After InstSimplifyPass *** (function: f) +; CHECK-FUNC-FILTER-NEXT: define i32 @f() + +; CHECK-PRINT-MOD-SCOPE: *** IR Dump At Start: *** +; CHECK-PRINT-MOD-SCOPE-NEXT: ModuleID = {{.+}} +; CHECK-PRINT-MOD-SCOPE: *** IR Dump After InstSimplifyPass *** (function: g) +; CHECK-PRINT-MOD-SCOPE-NEXT: ModuleID = {{.+}} +; CHECK-PRINT-MOD-SCOPE: *** IR Dump After InstSimplifyPass *** (function: f) +; CHECK-PRINT-MOD-SCOPE-NEXT: ModuleID = {{.+}} + +; CHECK-FUNC-FILTER-MOD-SCOPE: *** IR Dump At Start: *** +; CHECK-FUNC-FILTER-MOD-SCOPE-NEXT: ; ModuleID = {{.+}} +; CHECK-FUNC-FILTER-MOD-SCOPE: *** IR Dump After InstSimplifyPass (function: g) filtered out *** +; CHECK-FUNC-FILTER-MOD-SCOPE: *** IR Dump After InstSimplifyPass *** (function: f) +; CHECK-FUNC-FILTER-MOD-SCOPE-NEXT: ModuleID = {{.+}} + +; CHECK-FILTER-MULT-FUNC: *** IR Dump At Start: *** +; CHECK-FILTER-MULT-FUNC-NEXT: ; ModuleID = {{.+}} +; CHECK-FILTER-MULT-FUNC: *** IR Dump After InstSimplifyPass *** (function: g) +; CHECK-FILTER-MULT-FUNC-NEXT: define i32 @g() +; CHECK-FILTER-MULT-FUNC: *** IR Dump After InstSimplifyPass *** (function: f) +; CHECK-FILTER-MULT-FUNC-NEXT: define i32 @f() + +; CHECK-FILTER-PASSES: *** IR Dump After InstSimplifyPass (function: g) filtered out *** +; CHECK-FILTER-PASSES: *** IR Dump At Start: *** (function: g) +; CHECK-FILTER-PASSES-NEXT: ; ModuleID = {{.+}} +; CHECK-FILTER-PASSES: *** IR Dump After NoOpFunctionPass (function: g) omitted because no change *** +; CHECK-FILTER-PASSES: *** IR Dump After InstSimplifyPass (function: f) filtered out *** +; CHECK-FILTER-PASSES: *** IR Dump 
After NoOpFunctionPass (function: f) omitted because no change *** + +; CHECK-FILTER-MULT-PASSES: *** IR Dump At Start: *** (function: g) +; CHECK-FILTER-MULT-PASSES-NEXT: ; ModuleID = {{.+}} +; CHECK-FILTER-MULT-PASSES: *** IR Dump After InstSimplifyPass *** (function: g) +; CHECK-FILTER-MULT-PASSES-NEXT: define i32 @g() +; CHECK-FILTER-MULT-PASSES: *** IR Dump After NoOpFunctionPass (function: g) omitted because no change *** +; CHECK-FILTER-MULT-PASSES: *** IR Dump After InstSimplifyPass *** (function: f) +; CHECK-FILTER-MULT-PASSES-NEXT: define i32 @f() +; CHECK-FILTER-MULT-PASSES: *** IR Dump After NoOpFunctionPass (function: f) omitted because no change *** + +; CHECK-FILTER-FUNC-PASSES: *** IR Dump After InstSimplifyPass (function: g) filtered out *** +; CHECK-FILTER-FUNC-PASSES: *** IR Dump After NoOpFunctionPass (function: g) filtered out *** +; CHECK-FILTER-FUNC-PASSES: *** IR Dump At Start: *** (function: f) +; CHECK-FILTER-FUNC-PASSES-NEXT: ; ModuleID = {{.+}} +; CHECK-FILTER-FUNC-PASSES: *** IR Dump After InstSimplifyPass *** (function: f) +; CHECK-FILTER-FUNC-PASSES-NEXT: define i32 @f() +; CHECK-FILTER-FUNC-PASSES: *** IR Dump After NoOpFunctionPass (function: f) omitted because no change *** + +; CHECK-FILTER-FUNC-PASSES-MOD-SCOPE: *** IR Dump After InstSimplifyPass (function: g) filtered out *** +; CHECK-FILTER-FUNC-PASSES-MOD-SCOPE: *** IR Dump After NoOpFunctionPass (function: g) filtered out *** +; CHECK-FILTER-FUNC-PASSES-MOD-SCOPE: *** IR Dump At Start: *** (function: f) +; CHECK-FILTER-FUNC-PASSES-MOD-SCOPE-NEXT: ; ModuleID = {{.+}} +; CHECK-FILTER-FUNC-PASSES-MOD-SCOPE: *** IR Dump After InstSimplifyPass *** (function: f) +; CHECK-FILTER-FUNC-PASSES-MOD-SCOPE-NEXT: ModuleID = {{.+}} +; CHECK-FILTER-FUNC-PASSES-MOD-SCOPE: *** IR Dump After NoOpFunctionPass (function: f) omitted because no change *** + +; CHECK-MULT-PASSES-FILTER-FUNC: *** IR Dump At Start: *** +; CHECK-MULT-PASSES-FILTER-FUNC-NEXT: ; ModuleID = {{.+}} +; 
CHECK-MULT-PASSES-FILTER-FUNC: *** IR Dump After InstSimplifyPass (function: g) filtered out *** +; CHECK-MULT-PASSES-FILTER-FUNC: *** IR Dump After InstSimplifyPass (function: g) filtered out *** +; CHECK-MULT-PASSES-FILTER-FUNC: *** IR Dump After InstSimplifyPass *** (function: f) +; CHECK-MULT-PASSES-FILTER-FUNC-NEXT: define i32 @f() +; CHECK-MULT-PASSES-FILTER-FUNC: *** IR Dump After InstSimplifyPass (function: f) omitted because no change *** diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/noop-stores.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/noop-stores.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/noop-stores.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/noop-stores.ll @@ -101,6 +101,47 @@ ret i32 0 } +; Remove redundant store if loaded value is in another block inside a loop. +define i32 @test31(i1 %c, i32* %p, i32 %i) { +; CHECK-LABEL: @test31( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret i32 0 +; +entry: + %v = load i32, i32* %p, align 4 + br label %bb1 +bb1: + store i32 %v, i32* %p, align 4 + br i1 %c, label %bb1, label %bb2 +bb2: + ret i32 0 +} + +; Don't remove "redundant" store if %p is possibly stored to. +define i32 @test46(i1 %c, i32* %p, i32* %p2, i32 %i) { +; CHECK-LABEL: @test46( +; CHECK: load +; CHECK: store +; CHECK: store +; CHECK: ret i32 0 +; +entry: + %v = load i32, i32* %p, align 4 + br label %bb1 +bb1: + store i32 %v, i32* %p, align 4 + br i1 %c, label %bb1, label %bb2 +bb2: + store i32 0, i32* %p2, align 4 + br i1 %c, label %bb3, label %bb1 +bb3: + ret i32 0 +} + declare void @unknown_func() ; Remove redundant store, which is in the lame loop as the load. 
@@ -112,7 +153,7 @@ ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: ; CHECK-NEXT: call void @unknown_func() -; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB3:%.*]] +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1]], label [[BB3:%.*]] ; CHECK: bb3: ; CHECK-NEXT: ret i32 0 ; @@ -125,7 +166,7 @@ store i32 %v, i32* %p, align 4 ; Might read and overwrite value at %p, but doesn't matter. call void @unknown_func() - br i1 undef, label %bb1, label %bb3 + br i1 %c, label %bb1, label %bb3 bb3: ret i32 0 } @@ -168,4 +209,52 @@ ret void } +define i32 @test48(i1 %c, i32* %p) { +; CHECK-LABEL: @test48( +; CHECK: entry: +; CHECK-NEXT: [[V:%.*]] = load +; CHECK: store i32 0 +; CHECK: store i32 [[V]] +; CHECK: ret i32 0 +entry: + %v = load i32, i32* %p, align 4 + br i1 %c, label %bb0, label %bb0.0 + +bb0: + store i32 0, i32* %p + br i1 %c, label %bb1, label %bb2 + +bb0.0: + br label %bb1 + +bb1: + store i32 %v, i32* %p, align 4 + br i1 %c, label %bb2, label %bb0 +bb2: + ret i32 0 +} + +; TODO: Remove both redundant stores if loaded value is in another block inside a loop. 
+define i32 @test47(i1 %c, i32* %p, i32 %i) { +; X-CHECK-LABEL: @test47( +; X-CHECK-NEXT: entry: +; X-CHECK-NEXT: br label [[BB1:%.*]] +; X-CHECK: bb1: +; X-CHECK-NEXT: br i1 [[C:%.*]], label [[BB1]], label [[BB2:%.*]] +; X-CHECK: bb2: +; X-CHECK-NEXT: br i1 [[C]], label [[BB2]], label [[BB3:%.*]] +; X-CHECK: bb3: +; X-CHECK-NEXT: ret i32 0 +entry: + %v = load i32, i32* %p, align 4 + br label %bb1 +bb1: + store i32 %v, i32* %p, align 4 + br i1 %c, label %bb1, label %bb2 +bb2: + store i32 %v, i32* %p, align 4 + br i1 %c, label %bb3, label %bb1 +bb3: + ret i32 0 +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll deleted file mode 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll +++ /dev/null @@ -1,25 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; XFAIL: * -; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -S | FileCheck %s -; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -enable-dse-memoryssa -S | FileCheck %s -target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" - -; Remove redundant store if loaded value is in another block inside a loop. 
-define i32 @test31(i1 %c, i32* %p, i32 %i) { -; CHECK-LABEL: @test31( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[BB1:%.*]] -; CHECK: bb1: -; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2:%.*]] -; CHECK: bb2: -; CHECK-NEXT: ret i32 0 -; -entry: - %v = load i32, i32* %p, align 4 - br label %bb1 -bb1: - store i32 %v, i32* %p, align 4 - br i1 undef, label %bb1, label %bb2 -bb2: - ret i32 0 -} diff --git a/llvm/test/Transforms/InstCombine/bitreverse-known-bits.ll b/llvm/test/Transforms/InstCombine/bitreverse-known-bits.ll --- a/llvm/test/Transforms/InstCombine/bitreverse-known-bits.ll +++ b/llvm/test/Transforms/InstCombine/bitreverse-known-bits.ll @@ -1,11 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -S -instcombine | FileCheck %s declare i8 @llvm.bitreverse.i8(i8) declare i32 @llvm.bitreverse.i32(i32) -; CHECK-LABEL: @test1 -; CHECK: ret i1 true define i1 @test1(i32 %arg) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: ret i1 true +; %a = or i32 %arg, 4294901760 %b = call i32 @llvm.bitreverse.i32(i32 %a) %and = and i32 %b, 65535 @@ -13,9 +15,10 @@ ret i1 %res } -; CHECK-LABEL: @test2 -; CHECK: ret i1 true define i1 @test2(i32 %arg) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: ret i1 true +; %a = or i32 %arg, 1 %b = call i32 @llvm.bitreverse.i32(i32 %a) %c = and i32 %b, 2147483648 @@ -24,9 +27,10 @@ ret i1 %res } -; CHECK-LABEL: @test3 -; CHECK: ret i1 false define i1 @test3(i32 %arg) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: ret i1 false +; %a = or i32 %arg, 65536 %b = call i32 @llvm.bitreverse.i32(i32 %a) %and = and i32 %b, 32768 @@ -34,18 +38,22 @@ ret i1 %res } -; CHECK-LABEL: @add_bitreverse -; Make sure we process range metadata on bitreverse +; known bits for the bitreverse will say the result is in the range [0, 64) +; but the metadata says [0, 16). So make sure the range metadata wins. 
+; add %reverse, 1111 0000 +; should become +; or %reverse, 1111 0000 + define i8 @add_bitreverse(i8 %a) { +; CHECK-LABEL: @add_bitreverse( +; CHECK-NEXT: [[B:%.*]] = and i8 [[A:%.*]], -4 +; CHECK-NEXT: [[REVERSE:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[B]]), [[RNG0:!range !.*]] +; CHECK-NEXT: [[C:%.*]] = or i8 [[REVERSE]], -16 +; CHECK-NEXT: ret i8 [[C]] +; %b = and i8 %a, 252 - ; known bits for the bitreverse will say the result is in the range [0, 64) - ; but the metadata says [0, 16). So make sure the range metadata wins. - ; add %reverse, 1111 0000 - ; should become - ; or %reverse, 1111 0000 %reverse = call i8 @llvm.bitreverse.i8(i8 %b), !range !1 %c = add i8 %reverse, -16 -; CHECK: or i8 %reverse, -16 ret i8 %c } !1 = !{i8 0, i8 16} diff --git a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll --- a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll +++ b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll @@ -564,10 +564,12 @@ ret <2 x i8> %C } +; Folding this would only be legal if we sanitized undef to 0. 
define <2 x i8> @select_xor_icmp_vec_undef(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z) { ; CHECK-LABEL: @select_xor_icmp_vec_undef( ; CHECK-NEXT: [[A:%.*]] = icmp eq <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[C:%.*]] = select <2 x i1> [[A]], <2 x i8> [[Z:%.*]], <2 x i8> [[Y:%.*]] +; CHECK-NEXT: [[B:%.*]] = xor <2 x i8> [[X]], [[Z:%.*]] +; CHECK-NEXT: [[C:%.*]] = select <2 x i1> [[A]], <2 x i8> [[B]], <2 x i8> [[Y:%.*]] ; CHECK-NEXT: ret <2 x i8> [[C]] ; %A = icmp eq <2 x i8> %x, diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -2641,10 +2641,24 @@ ret i8 %sel } +define i8 @select_replacement_sub_noundef(i8 %x, i8 noundef %y, i8 %z) { +; CHECK-LABEL: @select_replacement_sub_noundef( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 0, i8 [[Z:%.*]] +; CHECK-NEXT: ret i8 [[SEL]] +; + %cmp = icmp eq i8 %x, %y + %sub = sub i8 %x, %y + %sel = select i1 %cmp, i8 %sub, i8 %z + ret i8 %sel +} + +; TODO: The transform is also safe without noundef. 
define i8 @select_replacement_sub(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @select_replacement_sub( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 0, i8 [[Z:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X]], [[Y]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[SUB]], i8 [[Z:%.*]] ; CHECK-NEXT: ret i8 [[SEL]] ; %cmp = icmp eq i8 %x, %y @@ -2653,11 +2667,29 @@ ret i8 %sel } +define i8 @select_replacement_shift_noundef(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @select_replacement_shift_noundef( +; CHECK-NEXT: [[SHR:%.*]] = lshr exact i8 [[X:%.*]], 1 +; CHECK-NEXT: call void @use_i8(i8 noundef [[SHR]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[SHR]], [[Y:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[Z:%.*]] +; CHECK-NEXT: ret i8 [[SEL]] +; + %shr = lshr exact i8 %x, 1 + call void @use_i8(i8 noundef %shr) + %cmp = icmp eq i8 %shr, %y + %shl = shl i8 %y, 1 + %sel = select i1 %cmp, i8 %shl, i8 %z + ret i8 %sel +} + +; TODO: The transform is also safe without noundef. define i8 @select_replacement_shift(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @select_replacement_shift( ; CHECK-NEXT: [[SHR:%.*]] = lshr exact i8 [[X:%.*]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[SHR]], [[Y:%.*]] -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[Z:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[Y]], 1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[SHL]], i8 [[Z:%.*]] ; CHECK-NEXT: ret i8 [[SEL]] ; %shr = lshr exact i8 %x, 1 @@ -2694,4 +2726,5 @@ } declare void @use(i1) +declare void @use_i8(i8) declare i32 @llvm.cttz.i32(i32, i1 immarg) diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll b/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll --- a/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll @@ -40,24 +40,24 @@ } ; https://llvm.org/PR43907 - make sure that NaN doesn't morph into Inf. -; SNaN remains SNaN. 
+; SNaN becomes QNaN. define float @nan_f64_trunc() { ; CHECK-LABEL: @nan_f64_trunc( -; CHECK-NEXT: ret float 0x7FF4000000000000 +; CHECK-NEXT: ret float 0x7FF8000000000000 ; %f = fptrunc double 0x7FF0000000000001 to float ret float %f } ; Verify again with a vector and different destination type. -; SNaN remains SNaN (first two elements). +; SNaN becomes SNaN (first two elements). ; QNaN remains QNaN (third element). ; Lower 42 bits of NaN source payload are lost. define <3 x half> @nan_v3f64_trunc() { ; CHECK-LABEL: @nan_v3f64_trunc( -; CHECK-NEXT: ret <3 x half> +; CHECK-NEXT: ret <3 x half> ; %f = fptrunc <3 x double> to <3 x half> ret <3 x half> %f diff --git a/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll @@ -0,0 +1,395 @@ +; RUN: opt < %s -S -loop-flatten -debug-only=loop-flatten 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Every function in this file has a reason that it can't be transformed. 
+ +; CHECK-NOT: Checks all passed, doing the transformation + +; Outer loop does not start at zero +define void @test_1(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %cmp25 = icmp sgt i32 %N, 0 + br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup + +for.body4.lr.ph: + %i.026 = phi i32 [ %inc10, %for.cond.cleanup3 ], [ 1, %entry ] + %mul = mul nsw i32 %i.026, %N + br label %for.body4 + +for.body4: + %j.024 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ] + %add = add nsw i32 %j.024, %mul + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add + %0 = load i32, i32* %arrayidx, align 4 + %mul5 = mul nsw i32 %0, %scale + %arrayidx8 = getelementptr inbounds i32, i32* %C, i32 %add + store i32 %mul5, i32* %arrayidx8, align 4 + %inc = add nuw nsw i32 %j.024, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup3, label %for.body4 + +for.cond.cleanup3: + %inc10 = add nuw nsw i32 %i.026, 1 + %exitcond27 = icmp eq i32 %inc10, %N + br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph + +for.cond.cleanup: + ret void +} + +; Inner loop does not start at zero +define void @test_2(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %cmp25 = icmp sgt i32 %N, 0 + br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup + +for.body4.lr.ph: + %i.026 = phi i32 [ %inc10, %for.cond.cleanup3 ], [ 0, %entry ] + %mul = mul nsw i32 %i.026, %N + br label %for.body4 + +for.body4: + %j.024 = phi i32 [ 1, %for.body4.lr.ph ], [ %inc, %for.body4 ] + %add = add nsw i32 %j.024, %mul + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add + %0 = load i32, i32* %arrayidx, align 4 + %mul5 = mul nsw i32 %0, %scale + %arrayidx8 = getelementptr inbounds i32, i32* %C, i32 %add + store i32 %mul5, i32* %arrayidx8, align 4 + %inc = add nuw nsw i32 %j.024, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup3, label %for.body4 + +for.cond.cleanup3: + 
%inc10 = add nuw nsw i32 %i.026, 1 + %exitcond27 = icmp eq i32 %inc10, %N + br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph + +for.cond.cleanup: + ret void +} + +; Outer IV used directly +define hidden void @test_3(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp25 = icmp eq i16 %N, 0 + br i1 %cmp25, label %for.cond.cleanup, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us + +for.body.us: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us + %i.026.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.cond.cleanup6_crit_edge.us ] + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i.026.us + %mul9.us = mul nuw nsw i32 %i.026.us, %conv + br label %for.body7.us + +for.body7.us: ; preds = %for.body.us, %for.body7.us + %j.024.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ] + %0 = load i32, i32* %arrayidx.us, align 4 + %mul.us = mul nsw i32 %0, %scale + %add.us = add nuw nsw i32 %j.024.us, %mul9.us + %arrayidx10.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 %mul.us, i32* %arrayidx10.us, align 4 + %inc.us = add nuw nsw i32 %j.024.us, 1 + %exitcond = icmp ne i32 %inc.us, %conv + br i1 %exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us + +for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us + %inc12.us = add nuw nsw i32 %i.026.us, 1 + %exitcond27 = icmp ne i32 %inc12.us, %conv + br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void +} + +; Inner IV used directly +define hidden void @test_4(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + 
%cmp25 = icmp eq i16 %N, 0 + br i1 %cmp25, label %for.cond.cleanup, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us + +for.body.us: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us + %i.026.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.cond.cleanup6_crit_edge.us ] + %mul9.us = mul nuw nsw i32 %i.026.us, %conv + br label %for.body7.us + +for.body7.us: ; preds = %for.body.us, %for.body7.us + %j.024.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ] + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %j.024.us + %0 = load i32, i32* %arrayidx.us, align 4 + %mul.us = mul nsw i32 %0, %scale + %add.us = add nuw nsw i32 %j.024.us, %mul9.us + %arrayidx10.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 %mul.us, i32* %arrayidx10.us, align 4 + %inc.us = add nuw nsw i32 %j.024.us, 1 + %exitcond = icmp ne i32 %inc.us, %conv + br i1 %exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us + +for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us + %inc12.us = add nuw nsw i32 %i.026.us, 1 + %exitcond27 = icmp ne i32 %inc12.us, %conv + br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void +} + +; Inner iteration count not invariant in outer loop +declare i32 @get_int() readonly +define void @test_5(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp27 = icmp eq i16 %N, 0 + br i1 %cmp27, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup5 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, 
%entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.cond.cleanup5 + %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %inc12, %for.cond.cleanup5 ] + %call = tail call i32 @get_int() + %cmp325 = icmp sgt i32 %call, 0 + br i1 %cmp325, label %for.body6.lr.ph, label %for.cond.cleanup5 + +for.body6.lr.ph: ; preds = %for.body + %mul = mul nsw i32 %call, %i.028 + br label %for.body6 + +for.cond.cleanup5.loopexit: ; preds = %for.body6 + br label %for.cond.cleanup5 + +for.cond.cleanup5: ; preds = %for.cond.cleanup5.loopexit, %for.body + %inc12 = add nuw nsw i32 %i.028, 1 + %exitcond29 = icmp ne i32 %inc12, %conv + br i1 %exitcond29, label %for.body, label %for.cond.cleanup.loopexit + +for.body6: ; preds = %for.body6.lr.ph, %for.body6 + %j.026 = phi i32 [ 0, %for.body6.lr.ph ], [ %inc, %for.body6 ] + %add = add nsw i32 %j.026, %mul + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add + %0 = load i32, i32* %arrayidx, align 4 + %mul7 = mul nsw i32 %0, %scale + %arrayidx10 = getelementptr inbounds i32, i32* %C, i32 %add + store i32 %mul7, i32* %arrayidx10, align 4 + %inc = add nuw nsw i32 %j.026, 1 + %exitcond = icmp ne i32 %inc, %call + br i1 %exitcond, label %for.body6, label %for.cond.cleanup5.loopexit +} + +; Inner loop has an early exit +define hidden void @test_6(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp39 = icmp eq i16 %N, 0 + br i1 %cmp39, label %for.cond.cleanup, label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + br label %for.body.us + +for.body.us: ; preds = %for.body.us.preheader, %cleanup.us + %i.040.us = phi i32 [ %inc19.us, %cleanup.us ], [ 0, %for.body.us.preheader ] + %mul.us = mul nuw nsw i32 %i.040.us, %conv + br label %for.body7.us + +for.body7.us: ; preds = %for.body.us, %if.end.us + %j.038.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %if.end.us ] + %add.us = add nuw nsw i32 %j.038.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %A, 
i32 %add.us + %0 = load i32, i32* %arrayidx.us, align 4 + %tobool.us = icmp eq i32 %0, 0 + br i1 %tobool.us, label %if.end.us, label %cleanup.us + +cleanup.us: ; preds = %if.end.us, %for.body7.us + %inc19.us = add nuw nsw i32 %i.040.us, 1 + %exitcond = icmp eq i32 %inc19.us, %conv + br i1 %exitcond, label %for.cond.cleanup, label %for.body.us + +if.end.us: ; preds = %for.body7.us + %arrayidx17.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 0, i32* %arrayidx17.us, align 4 + %inc.us = add nuw nsw i32 %j.038.us, 1 + %cmp4.us = icmp ult i32 %inc.us, %conv + br i1 %cmp4.us, label %for.body7.us, label %cleanup.us + +for.cond.cleanup: ; preds = %cleanup.us, %entry + ret void +} + +define hidden void @test_7(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp30 = icmp eq i16 %N, 0 + br i1 %cmp30, label %cleanup, label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + br label %for.body.us + +for.body.us: ; preds = %for.body.us.preheader, %for.cond2.for.cond.cleanup6_crit_edge.us + %i.031.us = phi i32 [ %inc15.us, %for.cond2.for.cond.cleanup6_crit_edge.us ], [ 0, %for.body.us.preheader ] + %call.us = tail call i32 @get_int() #2 + %tobool.us = icmp eq i32 %call.us, 0 + br i1 %tobool.us, label %for.body7.lr.ph.us, label %cleanup + +for.body7.us: ; preds = %for.body7.us, %for.body7.lr.ph.us + %j.029.us = phi i32 [ 0, %for.body7.lr.ph.us ], [ %inc.us, %for.body7.us ] + %add.us = add nuw nsw i32 %j.029.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us + %0 = load i32, i32* %arrayidx.us, align 4 + %mul9.us = mul nsw i32 %0, %scale + %arrayidx13.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 %mul9.us, i32* %arrayidx13.us, align 4 + %inc.us = add nuw nsw i32 %j.029.us, 1 + %exitcond = icmp eq i32 %inc.us, %conv + br i1 %exitcond, label %for.cond2.for.cond.cleanup6_crit_edge.us, label %for.body7.us + +for.body7.lr.ph.us: ; preds = 
%for.body.us + %mul.us = mul nuw nsw i32 %i.031.us, %conv + br label %for.body7.us + +for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us + %inc15.us = add nuw nsw i32 %i.031.us, 1 + %cmp.us = icmp ult i32 %inc15.us, %conv + br i1 %cmp.us, label %for.body.us, label %cleanup + +cleanup: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.us, %entry + ret void +} + +; Step is not 1 +define i32 @test_8(i32 %val, i16* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul nuw nsw i32 %i.018, 20 + br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, %mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 + %inc = add nuw nsw i32 %j.017, 1 + %exitcond = icmp ne i32 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 2 + %exitcond19 = icmp ne i32 %inc7, 10 + br i1 %exitcond19, label %for.body, label %for.end8 + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +; Step is not 1 +define i32 @test_9(i32 %val, i16* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul nuw nsw i32 %i.018, 20 + br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, %mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 
+ %inc = add nuw nsw i32 %j.017, 2 + %exitcond = icmp ne i32 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 1 + %exitcond19 = icmp ne i32 %inc7, 10 + br i1 %exitcond19, label %for.body, label %for.end8 + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +; Outer loop conditional phi +define i32 @e() { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.end16 + %f.033 = phi i32 [ 0, %entry ], [ %inc18, %for.end16 ] + %g.032 = phi i32 [ undef, %entry ], [ %g.3.lcssa, %for.end16 ] + %.pr = add i32 10, 10 + %tobool29 = icmp eq i32 %.pr, 0 + br i1 %tobool29, label %for.end, label %for.body2.lr.ph + +for.body2.lr.ph: ; preds = %for.body + br label %for.cond1.for.end_crit_edge + +for.cond1.for.end_crit_edge: ; preds = %for.body2.lr.ph + br label %for.end + +for.end: ; preds = %for.cond1.for.end_crit_edge, %for.body + %g.1.lcssa = phi i32 [ 0, %for.cond1.for.end_crit_edge ], [ %g.032, %for.body ] + br label %for.body5 + +for.body5: ; preds = %for.end, %lor.end + %i.031 = phi i32 [ 0, %for.end ], [ %inc15, %lor.end ] + %g.230 = phi i32 [ %g.1.lcssa, %for.end ], [ %g.3, %lor.end ] + %0 = add i32 10, 10 + %1 = add i32 10, 10 + %tobool9 = icmp eq i32 %1, 0 + br i1 %tobool9, label %lor.rhs, label %lor.end + +lor.rhs: ; preds = %for.body5 + %2 = add i32 10, 10 + %call11 = add i32 10, 10 + %tobool12 = icmp ne i32 %call11, 0 + br label %lor.end + +lor.end: ; preds = %for.body5, %lor.rhs + %g.3 = phi i32 [ %g.230, %for.body5 ], [ %call11, %lor.rhs ] + %3 = phi i1 [ true, %for.body5 ], [ %tobool12, %lor.rhs ] + %lor.ext = zext i1 %3 to i32 + %inc15 = add nuw nsw i32 %i.031, 1 + %exitcond = icmp ne i32 %inc15, 9 + br i1 %exitcond, label %for.body5, label %for.end16 + +for.end16: ; preds = %lor.end + %g.3.lcssa = phi i32 [ %g.3, %lor.end ] + %inc18 = add nuw nsw i32 %f.033, 1 + %exitcond34 = icmp ne i32 %inc18, 7 + br i1 %exitcond34, label %for.body, label %for.end19 + +for.end19: ; preds 
= %for.end16 + ret i32 undef +} diff --git a/llvm/test/Transforms/LoopFlatten/loop-flatten.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopFlatten/loop-flatten.ll @@ -0,0 +1,591 @@ +; RUN: opt < %s -S -loop-flatten -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + +; CHECK-LABEL: test1 +; Simple loop where the IV's is constant +define i32 @test1(i32 %val, i16* nocapture %A) { +entry: + br label %for.body +; CHECK: entry: +; CHECK: %flatten.tripcount = mul i32 20, 10 +; CHECK: br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul nuw nsw i32 %i.018, 20 + br label %for.body3 +; CHECK: for.body: +; CHECK: %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] +; CHECK: %mul = mul nuw nsw i32 %i.018, 20 +; CHECK: br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, %mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 + %inc = add nuw nsw i32 %j.017, 1 + %exitcond = icmp ne i32 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc6 +; CHECK: for.body3: +; CHECK: %j.017 = phi i32 [ 0, %for.body ] +; CHECK: %add = add nuw nsw i32 %j.017, %mul +; CHECK: %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018 +; CHECK: %0 = load i16, i16* %arrayidx, align 2 +; CHECK: %conv16 = zext i16 %0 to i32 +; CHECK: %add4 = add i32 %conv16, %val +; CHECK: %conv5 = trunc i32 %add4 to i16 +; CHECK: store i16 %conv5, i16* %arrayidx, align 2 +; CHECK: %inc = add nuw nsw i32 %j.017, 1 +; CHECK: %exitcond = icmp ne i32 %inc, 20 +; CHECK: br label 
%for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 1 + %exitcond19 = icmp ne i32 %inc7, 10 + br i1 %exitcond19, label %for.body, label %for.end8 +; CHECK: for.inc6: +; CHECK: %inc7 = add nuw nsw i32 %i.018, 1 +; CHECK: %exitcond19 = icmp ne i32 %inc7, %flatten.tripcount +; CHECK: br i1 %exitcond19, label %for.body, label %for.end8 + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +; CHECK-LABEL: test2 +; Same as above but non constant IV (which still cannot overflow) +define i32 @test2(i8 zeroext %I, i32 %val, i16* nocapture %A) { +entry: + %conv = zext i8 %I to i32 + %cmp26 = icmp eq i8 %I, 0 + br i1 %cmp26, label %for.end13, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %conv, %conv +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond2.for.inc11_crit_edge.us, %for.body.lr.ph.split.us + %i.027.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.inc11_crit_edge.us ] + %mul.us = mul nuw nsw i32 %i.027.us, %conv + br label %for.body6.us +; CHECK: for.body.us: +; CHECK: %i.027.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.inc11_crit_edge.us ] +; CHECK: %mul.us = mul nuw nsw i32 %i.027.us, %conv +; CHECK: br label %for.body6.us + +for.body6.us: ; preds = %for.body.us, %for.body6.us + %j.025.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body6.us ] + %add.us = add nuw nsw i32 %j.025.us, %mul.us + %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %add.us + %0 = load i16, i16* %arrayidx.us, align 2 + %conv823.us = zext i16 %0 to i32 + %add9.us = add i32 %conv823.us, %val + %conv10.us = trunc i32 %add9.us to i16 + store i16 %conv10.us, i16* %arrayidx.us, align 2 + %inc.us = add nuw nsw i32 %j.025.us, 1 + %exitcond = icmp ne i32 %inc.us, %conv + br i1 %exitcond, label %for.body6.us, label %for.cond2.for.inc11_crit_edge.us +; CHECK: for.body6.us: 
+; CHECK: %j.025.us = phi i32 [ 0, %for.body.us ] +; CHECK: %add.us = add nuw nsw i32 %j.025.us, %mul.us +; CHECK: %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.027.us +; CHECK: %0 = load i16, i16* %arrayidx.us, align 2 +; CHECK: %conv823.us = zext i16 %0 to i32 +; CHECK: %add9.us = add i32 %conv823.us, %val +; CHECK: %conv10.us = trunc i32 %add9.us to i16 +; CHECK: store i16 %conv10.us, i16* %arrayidx.us, align 2 +; CHECK: %inc.us = add nuw nsw i32 %j.025.us, 1 +; CHECK: %exitcond = icmp ne i32 %inc.us, %conv +; CHECK: br label %for.cond2.for.inc11_crit_edge.us + +for.cond2.for.inc11_crit_edge.us: ; preds = %for.body6.us + %inc12.us = add nuw nsw i32 %i.027.us, 1 + %exitcond28 = icmp ne i32 %inc12.us, %conv + br i1 %exitcond28, label %for.body.us, label %for.end13.loopexit +; CHECK: for.cond2.for.inc11_crit_edge.us: ; preds = %for.body6.us +; CHECK: %inc12.us = add nuw nsw i32 %i.027.us, 1 +; CHECK: %exitcond28 = icmp ne i32 %inc12.us, %flatten.tripcount +; CHECK: br i1 %exitcond28, label %for.body.us, label %for.end13.loopexit + +for.end13.loopexit: ; preds = %for.cond2.for.inc11_crit_edge.us + br label %for.end13 + +for.end13: ; preds = %for.end13.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %conv, %for.end13.loopexit ] + ret i32 %i.0.lcssa +} + + +; CHECK-LABEL: test3 +; Same as above, uses load to determine it can't overflow +define i32 @test3(i32 %N, i32 %val, i16* nocapture %A) local_unnamed_addr #0 { +entry: + %cmp21 = icmp eq i32 %N, 0 + br i1 %cmp21, label %for.end8, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %N, %N +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.inc6_crit_edge.us, %for.body.lr.ph.split.us + %i.022.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc7.us, %for.cond1.for.inc6_crit_edge.us ] + %mul.us = mul i32 %i.022.us, %N + br label %for.body3.us +; CHECK: 
for.body.us: +; CHECK: %i.022.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc7.us, %for.cond1.for.inc6_crit_edge.us ] +; CHECK: %mul.us = mul i32 %i.022.us, %N +; CHECK: br label %for.body3.us + +for.body3.us: ; preds = %for.body.us, %for.body3.us + %j.020.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body3.us ] + %add.us = add i32 %j.020.us, %mul.us + %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %add.us + %0 = load i16, i16* %arrayidx.us, align 2 + %conv18.us = zext i16 %0 to i32 + %add4.us = add i32 %conv18.us, %val + %conv5.us = trunc i32 %add4.us to i16 + store i16 %conv5.us, i16* %arrayidx.us, align 2 + %inc.us = add nuw i32 %j.020.us, 1 + %exitcond = icmp ne i32 %inc.us, %N + br i1 %exitcond, label %for.body3.us, label %for.cond1.for.inc6_crit_edge.us +; CHECK: for.body3.us: +; CHECK: %j.020.us = phi i32 [ 0, %for.body.us ] +; CHECK: %add.us = add i32 %j.020.us, %mul.us +; CHECK: %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.022.us +; CHECK: %0 = load i16, i16* %arrayidx.us, align 2 +; CHECK: %conv18.us = zext i16 %0 to i32 +; CHECK: %add4.us = add i32 %conv18.us, %val +; CHECK: %conv5.us = trunc i32 %add4.us to i16 +; CHECK: store i16 %conv5.us, i16* %arrayidx.us, align 2 +; CHECK: %inc.us = add nuw i32 %j.020.us, 1 +; CHECK: %exitcond = icmp ne i32 %inc.us, %N +; CHECK: br label %for.cond1.for.inc6_crit_edge.us + +for.cond1.for.inc6_crit_edge.us: ; preds = %for.body3.us + %inc7.us = add nuw i32 %i.022.us, 1 + %exitcond23 = icmp ne i32 %inc7.us, %N + br i1 %exitcond23, label %for.body.us, label %for.end8.loopexit +; CHECK: for.cond1.for.inc6_crit_edge.us: +; CHECK: %inc7.us = add nuw i32 %i.022.us, 1 +; CHECK: %exitcond23 = icmp ne i32 %inc7.us, %flatten.tripcount +; CHECK: br i1 %exitcond23, label %for.body.us, label %for.end8.loopexit + +for.end8.loopexit: ; preds = %for.cond1.for.inc6_crit_edge.us + br label %for.end8 + +for.end8: ; preds = %for.end8.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %N, 
%for.end8.loopexit ] + ret i32 %i.0.lcssa +} + + +; CHECK-LABEL: test4 +; Multiplication cannot overflow, so we can replace the original loop. +define void @test4(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp30 = icmp eq i16 %N, 0 + br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph.split.us +; CHECK: entry: +; CHECK: %[[LIMIT:.*]] = zext i16 %N to i32 +; CHECK: br i1 %{{.*}} label %for.cond.cleanup, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %[[TRIPCOUNT:.*]] = mul i32 %[[LIMIT]], %[[LIMIT]] +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us + %i.031.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc15.us, %for.cond2.for.cond.cleanup6_crit_edge.us ] + %mul.us = mul nuw nsw i32 %i.031.us, %conv + br label %for.body7.us +; CHECK: for.body.us: +; CHECK: %[[OUTER_IV:.*]] = phi i32 +; CHECK: br label %for.body7.us + +for.body7.us: ; preds = %for.body.us, %for.body7.us +; CHECK: for.body7.us: + %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ] + %add.us = add nuw nsw i32 %j.029.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us +; CHECK: getelementptr inbounds i32, i32* %A, i32 %[[OUTER_IV]] + %0 = load i32, i32* %arrayidx.us, align 4 + %mul9.us = mul nsw i32 %0, %scale +; CHECK: getelementptr inbounds i32, i32* %C, i32 %[[OUTER_IV]] + %arrayidx13.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 %mul9.us, i32* %arrayidx13.us, align 4 + %inc.us = add nuw nsw i32 %j.029.us, 1 + %exitcond = icmp ne i32 %inc.us, %conv + br i1 %exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us +; CHECK: br label %for.cond2.for.cond.cleanup6_crit_edge.us + +for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us + %inc15.us = add nuw 
nsw i32 %i.031.us, 1 + %exitcond32 = icmp ne i32 %inc15.us, %conv + br i1 %exitcond32, label %for.body.us, label %for.cond.cleanup.loopexit +; CHECK: for.cond2.for.cond.cleanup6_crit_edge.us: +; CHECK: br i1 %exitcond32, label %for.body.us, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us + br label %for.cond.cleanup +; CHECK: for.cond.cleanup.loopexit: +; CHECK: br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void +; CHECK: for.cond.cleanup: +; CHECK: ret void +} + + +; CHECK-LABEL: test5 +define i32 @test5(i8 zeroext %I, i16 zeroext %J) { +entry: + %0 = lshr i8 %I, 1 + %div = zext i8 %0 to i32 + %cmp30 = icmp eq i8 %0, 0 + br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %1 = lshr i16 %J, 1 + %div5 = zext i16 %1 to i32 + %cmp627 = icmp eq i16 %1, 0 + br i1 %cmp627, label %for.body.lr.ph.split, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %for.body.lr.ph + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %div5, %div +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us, %for.body.lr.ph.split.us + %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] + %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + br label %for.body9.us +; CHECK: for.body.us: +; CHECK: %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: br label %for.body9.us + +for.body9.us: ; preds = %for.body.us, %for.body9.us + %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body9.us ] + %x.128.us = phi i32 [ 
%x.031.us, %for.body.us ], [ %xor.us, %for.body9.us ] + %call.us = tail call i32 @func(i32 1) + %sub.us = sub nsw i32 %call.us, %x.128.us + %xor.us = xor i32 %sub.us, %x.128.us + %inc.us = add nuw nsw i32 %j.029.us, 1 + %cmp6.us = icmp ult i32 %inc.us, %div5 + br i1 %cmp6.us, label %for.body9.us, label %for.cond3.for.cond.cleanup8_crit_edge.us +; CHECK: for.body9.us: +; CHECK: %j.029.us = phi i32 [ 0, %for.body.us ] +; CHECK: %x.128.us = phi i32 [ %x.031.us, %for.body.us ] +; CHECK: %call.us = tail call i32 @func(i32 1) +; CHECK: %sub.us = sub nsw i32 %call.us, %x.128.us +; CHECK: %xor.us = xor i32 %sub.us, %x.128.us +; CHECK: %inc.us = add nuw nsw i32 %j.029.us, 1 +; CHECK: %cmp6.us = icmp ult i32 %inc.us, %div5 +; CHECK: br label %for.cond3.for.cond.cleanup8_crit_edge.us + +for.cond3.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us + %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] + %inc13.us = add nuw nsw i32 %i.032.us, 1 + %cmp.us = icmp ult i32 %inc13.us, %div + br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit +; CHECK: for.cond3.for.cond.cleanup8_crit_edge.us: +; CHECK: %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] +; CHECK: %inc13.us = add nuw nsw i32 %i.032.us, 1 +; CHECK: %cmp.us = icmp ult i32 %inc13.us, %flatten.tripcount +; CHECK: br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit + +for.body.lr.ph.split: ; preds = %for.body.lr.ph + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us + %xor.us.lcssa.lcssa = phi i32 [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + br label %for.cond.cleanup + +for.cond.cleanup.loopexit34: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit34, %for.cond.cleanup.loopexit, %entry + %x.0.lcssa = phi i32 [ 1, %entry ], [ %xor.us.lcssa.lcssa, %for.cond.cleanup.loopexit ], [ 1, %for.cond.cleanup.loopexit34 ] + ret i32 %x.0.lcssa + +for.body: ; preds = 
%for.body.lr.ph.split, %for.body + %i.032 = phi i32 [ 0, %for.body.lr.ph.split ], [ %inc13, %for.body ] + %inc13 = add nuw nsw i32 %i.032, 1 + %cmp = icmp ult i32 %inc13, %div + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit34 +} + + +; CHECK-LABEL: test6 +define i32 @test6(i8 zeroext %I, i16 zeroext %J) { +entry: + %0 = lshr i8 %I, 1 + %div = zext i8 %0 to i32 + %cmp30 = icmp eq i8 %0, 0 + br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %1 = lshr i16 %J, 1 + %div5 = zext i16 %1 to i32 + %cmp627 = icmp eq i16 %1, 0 + br i1 %cmp627, label %for.body.lr.ph.split, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %for.body.lr.ph + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %div5, %div +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us, %for.body.lr.ph.split.us + %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] + %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + %mul.us = mul nuw nsw i32 %i.032.us, %div5 + br label %for.body9.us +; CHECK: for.body.us: +; CHECK: %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: %mul.us = mul nuw nsw i32 %i.032.us, %div5 +; CHECK: br label %for.body9.us + +for.body9.us: ; preds = %for.body.us, %for.body9.us + %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body9.us ] + %x.128.us = phi i32 [ %x.031.us, %for.body.us ], [ %xor.us, %for.body9.us ] + %add.us = add nuw nsw i32 %j.029.us, %mul.us + %call.us = tail call i32 @func(i32 %add.us) + %sub.us = sub nsw i32 %call.us, %x.128.us + %xor.us = xor i32 %sub.us, %x.128.us + %inc.us = add 
nuw nsw i32 %j.029.us, 1 + %cmp6.us = icmp ult i32 %inc.us, %div5 + br i1 %cmp6.us, label %for.body9.us, label %for.cond3.for.cond.cleanup8_crit_edge.us +; CHECK: for.body9.us: +; CHECK: %j.029.us = phi i32 [ 0, %for.body.us ] +; CHECK: %x.128.us = phi i32 [ %x.031.us, %for.body.us ] +; CHECK: %add.us = add nuw nsw i32 %j.029.us, %mul.us +; CHECK: %call.us = tail call i32 @func(i32 %i.032.us) +; CHECK: %sub.us = sub nsw i32 %call.us, %x.128.us +; CHECK: %xor.us = xor i32 %sub.us, %x.128.us +; CHECK: %inc.us = add nuw nsw i32 %j.029.us, 1 +; CHECK: %cmp6.us = icmp ult i32 %inc.us, %div5 +; CHECK: br label %for.cond3.for.cond.cleanup8_crit_edge.us + +for.cond3.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us + %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] + %inc13.us = add nuw nsw i32 %i.032.us, 1 + %cmp.us = icmp ult i32 %inc13.us, %div + br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit +; CHECK: for.cond3.for.cond.cleanup8_crit_edge.us: +; CHECK: %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] +; CHECK: %inc13.us = add nuw nsw i32 %i.032.us, 1 +; CHECK: %cmp.us = icmp ult i32 %inc13.us, %flatten.tripcount +; CHECK: br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit + +for.body.lr.ph.split: ; preds = %for.body.lr.ph + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us + %xor.us.lcssa.lcssa = phi i32 [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + br label %for.cond.cleanup + +for.cond.cleanup.loopexit34: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit34, %for.cond.cleanup.loopexit, %entry + %x.0.lcssa = phi i32 [ 1, %entry ], [ %xor.us.lcssa.lcssa, %for.cond.cleanup.loopexit ], [ 1, %for.cond.cleanup.loopexit34 ] + ret i32 %x.0.lcssa + +for.body: ; preds = %for.body.lr.ph.split, %for.body + %i.032 = phi i32 [ 0, %for.body.lr.ph.split ], [ %inc13, %for.body ] + %inc13 = add nuw nsw i32 %i.032, 1 
+ %cmp = icmp ult i32 %inc13, %div + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit34 +} + +; CHECK-LABEL: test7 +; Various inner phis and conditions which we can still work with +define signext i16 @test7(i32 %I, i32 %J, i32* nocapture readonly %C, i16 signext %limit) { +entry: + %cmp43 = icmp eq i32 %J, 0 + br i1 %cmp43, label %for.end17, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %conv = sext i16 %limit to i32 + br label %for.body.us +; CHECK: for.body.lr.ph: +; CHECK: %conv = sext i16 %limit to i32 +; CHECK: %flatten.tripcount = mul i32 %J, %J +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.inc15_crit_edge.us, %for.body.lr.ph + %i.047.us = phi i32 [ 0, %for.body.lr.ph ], [ %inc16.us, %for.cond1.for.inc15_crit_edge.us ] + %ret.046.us = phi i16 [ 0, %for.body.lr.ph ], [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] + %prev.045.us = phi i32 [ 0, %for.body.lr.ph ], [ %.lcssa, %for.cond1.for.inc15_crit_edge.us ] + %tmp.044.us = phi i32 [ 0, %for.body.lr.ph ], [ %tmp.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] + %mul.us = mul i32 %i.047.us, %J + br label %for.body3.us +; CHECK: for.body.us: +; CHECK: %i.047.us = phi i32 [ 0, %for.body.lr.ph ], [ %inc16.us, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %ret.046.us = phi i16 [ 0, %for.body.lr.ph ], [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %prev.045.us = phi i32 [ 0, %for.body.lr.ph ], [ %.lcssa, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %tmp.044.us = phi i32 [ 0, %for.body.lr.ph ], [ %tmp.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %mul.us = mul i32 %i.047.us, %J +; CHECK: br label %for.body3.us + +for.body3.us: ; preds = %for.body.us, %if.end.us + %j.040.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %if.end.us ] + %ret.139.us = phi i16 [ %ret.046.us, %for.body.us ], [ %ret.2.us, %if.end.us ] + %prev.138.us = phi i32 [ %prev.045.us, %for.body.us ], [ %0, %if.end.us ] + %tmp.137.us = phi i32 [ %tmp.044.us, 
%for.body.us ], [ %tmp.2.us, %if.end.us ] + %add.us = add i32 %j.040.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %C, i32 %add.us + %0 = load i32, i32* %arrayidx.us, align 4 + %add4.us = add nsw i32 %0, %tmp.137.us + %cmp5.us = icmp sgt i32 %add4.us, %conv + br i1 %cmp5.us, label %if.then.us, label %if.else.us +; CHECK: for.body3.us: +; CHECK: %j.040.us = phi i32 [ 0, %for.body.us ] +; CHECK: %ret.139.us = phi i16 [ %ret.046.us, %for.body.us ] +; CHECK: %prev.138.us = phi i32 [ %prev.045.us, %for.body.us ] +; CHECK: %tmp.137.us = phi i32 [ %tmp.044.us, %for.body.us ] +; CHECK: %add.us = add i32 %j.040.us, %mul.us +; CHECK: %arrayidx.us = getelementptr inbounds i32, i32* %C, i32 %i.047.us +; CHECK: %0 = load i32, i32* %arrayidx.us, align 4 +; CHECK: %add4.us = add nsw i32 %0, %tmp.137.us +; CHECK: %cmp5.us = icmp sgt i32 %add4.us, %conv +; CHECK: br i1 %cmp5.us, label %if.then.us, label %if.else.us + +if.else.us: ; preds = %for.body3.us + %cmp10.us = icmp sgt i32 %0, %prev.138.us + %cond.us = zext i1 %cmp10.us to i32 + %conv1235.us = zext i16 %ret.139.us to i32 + %add13.us = add nuw nsw i32 %cond.us, %conv1235.us + br label %if.end.us +; CHECK: if.else.us: +; CHECK: %cmp10.us = icmp sgt i32 %0, %prev.138.us +; CHECK: %cond.us = zext i1 %cmp10.us to i32 +; CHECK: %conv1235.us = zext i16 %ret.139.us to i32 +; CHECK: %add13.us = add nuw nsw i32 %cond.us, %conv1235.us +; CHECK: br label %if.end.us + +if.then.us: ; preds = %for.body3.us + %conv7.us = sext i16 %ret.139.us to i32 + %add8.us = add nsw i32 %conv7.us, 10 + br label %if.end.us +; CHECK: if.then.us: +; CHECK: %conv7.us = sext i16 %ret.139.us to i32 +; CHECK: %add8.us = add nsw i32 %conv7.us, 10 +; CHECK: br label %if.end.us + +if.end.us: ; preds = %if.then.us, %if.else.us + %tmp.2.us = phi i32 [ 0, %if.then.us ], [ %add4.us, %if.else.us ] + %ret.2.in.us = phi i32 [ %add8.us, %if.then.us ], [ %add13.us, %if.else.us ] + %ret.2.us = trunc i32 %ret.2.in.us to i16 + %inc.us = add nuw i32 %j.040.us, 
1 + %exitcond = icmp ne i32 %inc.us, %J + br i1 %exitcond, label %for.body3.us, label %for.cond1.for.inc15_crit_edge.us +; CHECK: if.end.us: +; CHECK: %tmp.2.us = phi i32 [ 0, %if.then.us ], [ %add4.us, %if.else.us ] +; CHECK: %ret.2.in.us = phi i32 [ %add8.us, %if.then.us ], [ %add13.us, %if.else.us ] +; CHECK: %ret.2.us = trunc i32 %ret.2.in.us to i16 +; CHECK: %inc.us = add nuw i32 %j.040.us, 1 +; CHECK: %exitcond = icmp ne i32 %inc.us, %J +; CHECK: br label %for.cond1.for.inc15_crit_edge.us + +for.cond1.for.inc15_crit_edge.us: ; preds = %if.end.us + %tmp.2.us.lcssa = phi i32 [ %tmp.2.us, %if.end.us ] + %ret.2.us.lcssa = phi i16 [ %ret.2.us, %if.end.us ] + %.lcssa = phi i32 [ %0, %if.end.us ] + %inc16.us = add nuw i32 %i.047.us, 1 + %exitcond49 = icmp ne i32 %inc16.us, %J + br i1 %exitcond49, label %for.body.us, label %for.end17.loopexit +; CHECK: for.cond1.for.inc15_crit_edge.us: +; CHECK: %tmp.2.us.lcssa = phi i32 [ %tmp.2.us, %if.end.us ] +; CHECK: %ret.2.us.lcssa = phi i16 [ %ret.2.us, %if.end.us ] +; CHECK: %.lcssa = phi i32 [ %0, %if.end.us ] +; CHECK: %inc16.us = add nuw i32 %i.047.us, 1 +; CHECK: %exitcond49 = icmp ne i32 %inc16.us, %flatten.tripcount +; CHECK: br i1 %exitcond49, label %for.body.us, label %for.end17.loopexit + +for.end17.loopexit: ; preds = %for.cond1.for.inc15_crit_edge.us + %ret.2.us.lcssa.lcssa = phi i16 [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] + br label %for.end17 + +for.end17: ; preds = %for.end17.loopexit, %entry + %ret.0.lcssa = phi i16 [ 0, %entry ], [ %ret.2.us.lcssa.lcssa, %for.end17.loopexit ] + ret i16 %ret.0.lcssa +} + +; CHECK-LABEL: test8 +; Same as test1, but with different continue block order +; (uses icmp eq and loops on false) +define i32 @test8(i32 %val, i16* nocapture %A) { +entry: + br label %for.body +; CHECK: entry: +; CHECK: %flatten.tripcount = mul i32 20, 10 +; CHECK: br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul 
nuw nsw i32 %i.018, 20 + br label %for.body3 +; CHECK: for.body: +; CHECK: %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] +; CHECK: %mul = mul nuw nsw i32 %i.018, 20 +; CHECK: br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, %mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 + %inc = add nuw nsw i32 %j.017, 1 + %exitcond = icmp eq i32 %inc, 20 + br i1 %exitcond, label %for.inc6, label %for.body3 +; CHECK: for.body3: +; CHECK: %j.017 = phi i32 [ 0, %for.body ] +; CHECK: %add = add nuw nsw i32 %j.017, %mul +; CHECK: %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018 +; CHECK: %0 = load i16, i16* %arrayidx, align 2 +; CHECK: %conv16 = zext i16 %0 to i32 +; CHECK: %add4 = add i32 %conv16, %val +; CHECK: %conv5 = trunc i32 %add4 to i16 +; CHECK: store i16 %conv5, i16* %arrayidx, align 2 +; CHECK: %inc = add nuw nsw i32 %j.017, 1 +; CHECK: %exitcond = icmp eq i32 %inc, 20 +; CHECK: br label %for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 1 + %exitcond19 = icmp eq i32 %inc7, 10 + br i1 %exitcond19, label %for.end8, label %for.body +; CHECK: for.inc6: +; CHECK: %inc7 = add nuw nsw i32 %i.018, 1 +; CHECK: %exitcond19 = icmp eq i32 %inc7, %flatten.tripcount +; CHECK: br i1 %exitcond19, label %for.end8, label %for.body + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +declare i32 @func(i32) + diff --git a/llvm/test/Transforms/LoopFlatten/pr40581.ll b/llvm/test/Transforms/LoopFlatten/pr40581.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopFlatten/pr40581.ll @@ -0,0 +1,108 @@ +; RUN: opt < %s -S -loop-flatten -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s + +target datalayout = 
"e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + +; Test case and IR corresponding to this code: +; +; int k = 0; +; for(int i = 0; i < n; i++) +; for(int j = 0; j < n; j++) { +; A[k] = B[k]; +; k++; +; } +; +; TODO: this case doesn't trigger yet. +; +define dso_local void @v0(i32 %n, i32* nocapture %A, i32* nocapture readonly %B) local_unnamed_addr #0 { +; +; CHECK-LABEL: @v0 +; CHECK-NOT: %flatten.tripcount = mul i32 %n, %n +; +entry: + %cmp21 = icmp sgt i32 %n, 0 + br i1 %cmp21, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup + +for.cond1.preheader.us.preheader: + br label %for.cond1.preheader.us + +for.cond1.preheader.us: + %i.023.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] + %k.022.us = phi i32 [ %inc.us.lcssa, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] + %0 = add i32 %n, %k.022.us + br label %for.body4.us + +for.body4.us: + %k.119.us = phi i32 [ %k.022.us, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ] + %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %k.119.us + %1 = load i32, i32* %arrayidx.us, align 4 + %arrayidx5.us = getelementptr inbounds i32, i32* %A, i32 %k.119.us + store i32 %1, i32* %arrayidx5.us, align 4 + %inc.us = add i32 %k.119.us, 1 + %exitcond = icmp ne i32 %inc.us, %0 + br i1 %exitcond, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us + +for.cond1.for.cond.cleanup3_crit_edge.us: + %inc.us.lcssa = phi i32 [ %inc.us, %for.body4.us ] + %inc8.us = add nuw nsw i32 %i.023.us, 1 + %cmp.us = icmp slt i32 %inc8.us, %n + br i1 %cmp.us, label %for.cond1.preheader.us, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; Test case and IR corresponding to this code: +; +; for(int i = 0; i < n; i++) +; for(int j = 0; j < n; j++) { +; int k = i*n+j; +; A[k] = B[k]; +; k++; +; } +; +define dso_local void @v1(i32 %n, 
i32* nocapture %A, i32* nocapture readonly %B) local_unnamed_addr #0 { +; +; CHECK-LABEL: @v1 +; CHECK: for.cond1.preheader.us.preheader: +; CHECK: %flatten.tripcount = mul i32 %n, %n +; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: +; CHECK: %inc8.us = add nuw nsw i32 %i.024.us, 1 +; CHECK: %cmp.us = icmp slt i32 %inc8.us, %flatten.tripcount +; +entry: + %cmp23 = icmp sgt i32 %n, 0 + br i1 %cmp23, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup + +for.cond1.preheader.us.preheader: + br label %for.cond1.preheader.us + +for.cond1.preheader.us: + %i.024.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] + %mul.us = mul nsw i32 %i.024.us, %n + br label %for.body4.us + +for.body4.us: + %j.022.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc6.us, %for.body4.us ] + %add.us = add nsw i32 %j.022.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %add.us + %0 = load i32, i32* %arrayidx.us, align 4 + %arrayidx5.us = getelementptr inbounds i32, i32* %A, i32 %add.us + store i32 %0, i32* %arrayidx5.us, align 4 + %inc6.us = add nuw nsw i32 %j.022.us, 1 + %exitcond = icmp ne i32 %inc6.us, %n + br i1 %exitcond, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us + +for.cond1.for.cond.cleanup3_crit_edge.us: + %inc8.us = add nuw nsw i32 %i.024.us, 1 + %cmp.us = icmp slt i32 %inc8.us, %n + br i1 %cmp.us, label %for.cond1.preheader.us, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/nancvt.ll b/llvm/test/Transforms/PhaseOrdering/X86/nancvt.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/nancvt.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/nancvt.ll @@ -18,6 +18,9 @@ @var = external global i32 +; SNAN becomes QNAN on fptrunc: +; 2147228864 = 0x7ffc1cc0 : QNAN + define i32 @main() { ; CHECK-LABEL: @main( ; CHECK-NEXT: entry: @@ -30,15 
+33,15 @@ ; CHECK-NEXT: store volatile i32 2147228864, i32* @var, align 4 ; CHECK-NEXT: store volatile i32 2147228864, i32* @var, align 4 ; CHECK-NEXT: store volatile i32 2147228864, i32* @var, align 4 -; CHECK-NEXT: store volatile i32 2146502828, i32* @var, align 4 +; CHECK-NEXT: store volatile i32 2147027116, i32* @var, align 4 ; CHECK-NEXT: store volatile i32 -1610612736, i32* @var, align 4 -; CHECK-NEXT: store volatile i32 2146502828, i32* @var, align 4 +; CHECK-NEXT: store volatile i32 2147027116, i32* @var, align 4 ; CHECK-NEXT: store volatile i32 -2147483648, i32* @var, align 4 -; CHECK-NEXT: store volatile i32 2146502828, i32* @var, align 4 +; CHECK-NEXT: store volatile i32 2147027116, i32* @var, align 4 ; CHECK-NEXT: store volatile i32 -1073741824, i32* @var, align 4 -; CHECK-NEXT: store volatile i32 2143034560, i32* @var, align 4 -; CHECK-NEXT: store volatile i32 2143034560, i32* @var, align 4 -; CHECK-NEXT: store volatile i32 2143034560, i32* @var, align 4 +; CHECK-NEXT: store volatile i32 2147228864, i32* @var, align 4 +; CHECK-NEXT: store volatile i32 2147228864, i32* @var, align 4 +; CHECK-NEXT: store volatile i32 2147228864, i32* @var, align 4 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SampleProfile/flattened.ll b/llvm/test/Transforms/SampleProfile/flattened.ll --- a/llvm/test/Transforms/SampleProfile/flattened.ll +++ b/llvm/test/Transforms/SampleProfile/flattened.ll @@ -1,13 +1,13 @@ ; Check flattened profile will not be read in thinlto postlink. 
-; RUN: opt < %s -O2 -flattened-profile-used -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/flattened.prof -enable-chr=false -perform-thinlto=true -S | FileCheck %s +; RUN: opt < %s -O2 -flattened-profile-used -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/flattened.prof -enable-chr=false -perform-thinlto=true -enable-new-pm=0 -S | FileCheck %s ; RUN: opt < %s -passes='thinlto' -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/flattened.prof -flattened-profile-used -S | FileCheck %s ; ; Check flattened profile will be read in thinlto prelink. -; RUN: opt < %s -O2 -flattened-profile-used -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/flattened.prof -enable-chr=false -prepare-for-thinlto=true -S | FileCheck %s --check-prefix=PRELINK +; RUN: opt < %s -O2 -flattened-profile-used -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/flattened.prof -enable-chr=false -prepare-for-thinlto=true -enable-new-pm=0 -S | FileCheck %s --check-prefix=PRELINK ; RUN: opt < %s -passes='thinlto-pre-link' -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/flattened.prof -flattened-profile-used -S | FileCheck %s --check-prefix=PRELINK ; ; Check flattened profile will be read in non-thinlto mode. 
-; RUN: opt < %s -O2 -flattened-profile-used -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/flattened.prof -enable-chr=false -S | FileCheck %s --check-prefix=NOTHINLTO +; RUN: opt < %s -O2 -flattened-profile-used -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/flattened.prof -enable-chr=false -enable-new-pm=0 -S | FileCheck %s --check-prefix=NOTHINLTO ; RUN: opt < %s -passes='default' -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/flattened.prof -flattened-profile-used -S | FileCheck %s --check-prefix=NOTHINLTO ; ; CHECK-NOT: !{!"ProfileFormat", !"SampleProfile"} diff --git a/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll b/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll --- a/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll +++ b/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll @@ -1,6 +1,6 @@ ; Test we lose details of not inlined profile without '-sample-profile-merge-inlinee' -; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=false -S | FileCheck -check-prefix=SCALE %s -; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=true -S | FileCheck -check-prefix=SCALE %s +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=false -enable-new-pm=0 -S | FileCheck -check-prefix=SCALE %s +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=true -enable-new-pm=0 -S | FileCheck -check-prefix=SCALE %s ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=false -S | FileCheck -check-prefix=SCALE %s ; Test we properly merge not inlined profile with '-sample-profile-merge-inlinee' diff --git a/llvm/test/Transforms/SampleProfile/profile-sample-accurate.ll 
b/llvm/test/Transforms/SampleProfile/profile-sample-accurate.ll --- a/llvm/test/Transforms/SampleProfile/profile-sample-accurate.ll +++ b/llvm/test/Transforms/SampleProfile/profile-sample-accurate.ll @@ -1,17 +1,17 @@ -; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -S | FileCheck %s --check-prefix=CALL_SUM_IS_WARM +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -enable-new-pm=0 -S | FileCheck %s --check-prefix=CALL_SUM_IS_WARM ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=900000 -profile-sample-accurate -S | FileCheck %s --check-prefix=CALL_SUM_IS_HOT -; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -S | FileCheck %s --check-prefix=CALL_SUM_IS_WARM +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -enable-new-pm=0 -S | FileCheck %s --check-prefix=CALL_SUM_IS_WARM ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=900000 -profile-sample-accurate -S | FileCheck %s --check-prefix=CALL_SUM_IS_HOT ; RUN: llvm-profdata merge -sample -extbinary -prof-sym-list=%S/Inputs/profile-symbol-list.text %S/Inputs/profsampleacc.extbinary.afdo -o %t.symlist.afdo -; RUN: opt < %s -sample-profile -sample-profile-file=%t.symlist.afdo -profile-summary-cutoff-hot=600000 -profile-accurate-for-symsinlist -S | FileCheck %s --check-prefix=PROFSYMLIST +; RUN: opt < %s -sample-profile -sample-profile-file=%t.symlist.afdo -profile-summary-cutoff-hot=600000 -profile-accurate-for-symsinlist -enable-new-pm=0 -S | FileCheck %s 
--check-prefix=PROFSYMLIST ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t.symlist.afdo -profile-summary-cutoff-hot=600000 -profile-accurate-for-symsinlist -S | FileCheck %s --check-prefix=PROFSYMLIST ; ; If -profile-accurate-for-symsinlist and -profile-sample-accurate both present, ; -profile-sample-accurate will override -profile-accurate-for-symsinlist. -; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -profile-accurate-for-symsinlist -S | FileCheck %s --check-prefix=CALL_SUM_IS_WARM +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -profile-accurate-for-symsinlist -enable-new-pm=0 -S | FileCheck %s --check-prefix=CALL_SUM_IS_WARM ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=900000 -profile-sample-accurate -profile-accurate-for-symsinlist -S | FileCheck %s --check-prefix=CALL_SUM_IS_HOT -; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -profile-accurate-for-symsinlist -S | FileCheck %s --check-prefix=CALL_SUM_IS_WARM +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -profile-accurate-for-symsinlist -enable-new-pm=0 -S | FileCheck %s --check-prefix=CALL_SUM_IS_WARM ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=900000 -profile-sample-accurate -profile-accurate-for-symsinlist -S | FileCheck %s --check-prefix=CALL_SUM_IS_HOT ; ; Original C++ test case diff --git a/llvm/test/Transforms/WholeProgramDevirt/import.ll b/llvm/test/Transforms/WholeProgramDevirt/import.ll --- 
a/llvm/test/Transforms/WholeProgramDevirt/import.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/import.ll @@ -1,3 +1,4 @@ +; RUN: opt -S -passes=wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-single-impl.yaml < %s | FileCheck --check-prefixes=CHECK,SINGLE-IMPL %s ; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-single-impl.yaml < %s | FileCheck --check-prefixes=CHECK,SINGLE-IMPL %s ; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-uniform-ret-val.yaml < %s | FileCheck --check-prefixes=CHECK,INDIR,UNIFORM-RET-VAL %s ; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-unique-ret-val0.yaml < %s | FileCheck --check-prefixes=CHECK,INDIR,UNIQUE-RET-VAL0 %s diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg b/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg --- a/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg +++ b/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg @@ -10,7 +10,7 @@ config.unsupported = True else: - # We need libpfm to be installed and the host to be at least skylake. + # We need libpfm to be installed and the host to be support LBR format with cycles. 
llvm_exegesis_exe = lit.util.which('llvm-exegesis', config.llvm_tools_dir) if not llvm_exegesis_exe: print('llvm-exegesis not found') @@ -18,14 +18,10 @@ else: try: with open(os.devnull, 'w') as quiet: - check_llvm_exegesis_uops_result = subprocess.call( - [llvm_exegesis_exe, '-mode', 'uops', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet) check_llvm_exegesis_latency_result = subprocess.call( - [llvm_exegesis_exe, '-mode', 'latency', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet) + [llvm_exegesis_exe, '-mode', 'latency', '-x86-lbr-sample-period', '123', '-repetition-mode', 'loop', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet) except OSError: print('could not exec llvm-exegesis') config.unsupported = True - if not check_llvm_exegesis_uops_result == 0: - config.unsupported = True if not check_llvm_exegesis_latency_result == 0: config.unsupported = True diff --git a/llvm/test/tools/llvm-mca/ARM/m4-int.s b/llvm/test/tools/llvm-mca/ARM/m4-int.s --- a/llvm/test/tools/llvm-mca/ARM/m4-int.s +++ b/llvm/test/tools/llvm-mca/ARM/m4-int.s @@ -746,9 +746,9 @@ # CHECK-NEXT: 1 1 1.00 smulwt r0, r1, r2 # CHECK-NEXT: 1 2 1.00 smusd r0, r1, r2 # CHECK-NEXT: 1 2 1.00 smusdx r0, r1, r2 -# CHECK-NEXT: 1 1 1.00 U ssat r0, #1, r2 -# CHECK-NEXT: 1 1 1.00 U ssat r0, #1, r2, lsl #1 -# CHECK-NEXT: 1 1 1.00 U ssat16 r0, #1, r1 +# CHECK-NEXT: 1 1 1.00 ssat r0, #1, r2 +# CHECK-NEXT: 1 1 1.00 ssat r0, #1, r2, lsl #1 +# CHECK-NEXT: 1 1 1.00 ssat16 r0, #1, r1 # CHECK-NEXT: 1 1 1.00 * * U ssax r0, r1, r2 # CHECK-NEXT: 1 1 1.00 * * U ssbb # CHECK-NEXT: 1 1 1.00 * * U ssub16 r0, r1, r2 @@ -858,9 +858,9 @@ # CHECK-NEXT: 1 1 1.00 uqsub8 r0, r1, r2 # CHECK-NEXT: 1 1 1.00 usad8 r0, r1, r2 # CHECK-NEXT: 1 1 1.00 usada8 r0, r1, r2, r3 -# CHECK-NEXT: 1 1 1.00 U usat r0, #1, r1 -# CHECK-NEXT: 1 1 1.00 U usat r0, #1, r1, lsl #1 -# CHECK-NEXT: 1 1 1.00 U usat16 r0, #1, r1 +# CHECK-NEXT: 1 1 1.00 usat r0, #1, r1 +# CHECK-NEXT: 1 1 1.00 usat r0, #1, r1, lsl #1 +# CHECK-NEXT: 
1 1 1.00 usat16 r0, #1, r1 # CHECK-NEXT: 1 1 1.00 * * U usax r0, r1, r2 # CHECK-NEXT: 1 1 1.00 * * U usub16 r0, r1, r2 # CHECK-NEXT: 1 1 1.00 * * U usub8 r0, r1, r2 diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp --- a/llvm/tools/opt/NewPMDriver.cpp +++ b/llvm/tools/opt/NewPMDriver.cpp @@ -375,6 +375,9 @@ PB.registerLoopAnalyses(LAM); PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + if (TM) + TM->registerPassBuilderCallbacks(PB, DebugPM); + ModulePassManager MPM(DebugPM); if (VK > VK_NoVerifier) MPM.addPass(VerifierPass()); diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -1816,11 +1816,12 @@ EXPECT_FALSE(losesInfo); test = APFloat::getSNaN(APFloat::IEEEsingle()); - APFloat X87SNaN = APFloat::getSNaN(APFloat::x87DoubleExtended()); APFloat::opStatus status = test.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &losesInfo); - EXPECT_TRUE(test.bitwiseIsEqual(X87SNaN)); + // Conversion quiets the SNAN, so now 2 bits of the 64-bit significand should be set. + APInt topTwoBits(64, 0x6000000000000000); + EXPECT_TRUE(test.bitwiseIsEqual(APFloat::getQNaN(APFloat::x87DoubleExtended(), false, &topTwoBits))); EXPECT_FALSE(losesInfo); - EXPECT_EQ(status, APFloat::opOK); + EXPECT_EQ(status, APFloat::opInvalidOp); test = APFloat::getQNaN(APFloat::IEEEsingle()); APFloat X87QNaN = APFloat::getQNaN(APFloat::x87DoubleExtended()); @@ -1832,6 +1833,7 @@ test = APFloat::getSNaN(APFloat::x87DoubleExtended()); test.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &losesInfo); + APFloat X87SNaN = APFloat::getSNaN(APFloat::x87DoubleExtended()); EXPECT_TRUE(test.bitwiseIsEqual(X87SNaN)); EXPECT_FALSE(losesInfo); @@ -1841,13 +1843,13 @@ EXPECT_TRUE(test.bitwiseIsEqual(X87QNaN)); EXPECT_FALSE(losesInfo); - // The payload is lost in truncation, but we must retain NaN, so we set the bit after the quiet bit. 
+ // The payload is lost in truncation, but we retain NaN by setting the quiet bit. APInt payload(52, 1); test = APFloat::getSNaN(APFloat::IEEEdouble(), false, &payload); status = test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo); - EXPECT_EQ(0x7fa00000, test.bitcastToAPInt()); + EXPECT_EQ(0x7fc00000, test.bitcastToAPInt()); EXPECT_TRUE(losesInfo); - EXPECT_EQ(status, APFloat::opOK); + EXPECT_EQ(status, APFloat::opInvalidOp); // The payload is lost in truncation. QNaN remains QNaN. test = APFloat::getQNaN(APFloat::IEEEdouble(), false, &payload); @@ -4696,4 +4698,15 @@ EXPECT_EQ(0x3fe8000000000000ull, Result.bitcastToAPInt().getRawData()[0]); EXPECT_EQ(0x3c98000000000000ull, Result.bitcastToAPInt().getRawData()[1]); } + +TEST(APFloatTest, x87Largest) { + APFloat MaxX87Val = APFloat::getLargest(APFloat::x87DoubleExtended()); + EXPECT_TRUE(MaxX87Val.isLargest()); +} + +TEST(APFloatTest, x87Next) { + APFloat F(APFloat::x87DoubleExtended(), "-1.0"); + F.next(false); + EXPECT_TRUE(ilogb(F) == -1); +} } diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp --- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp +++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp @@ -1251,4 +1251,36 @@ }); } +TEST_F(ScalarEvolutionsTest, ImpliedViaAddRecStart) { + LLVMContext C; + SMDiagnostic Err; + std::unique_ptr M = parseAssemblyString( + "define void @foo(i32* %p) { " + "entry: " + " %x = load i32, i32* %p, !range !0 " + " br label %loop " + "loop: " + " %iv = phi i32 [ %x, %entry], [%iv.next, %backedge] " + " %ne.check = icmp ne i32 %iv, 0 " + " br i1 %ne.check, label %backedge, label %exit " + "backedge: " + " %iv.next = add i32 %iv, -1 " + " br label %loop " + "exit:" + " ret void " + "} " + "!0 = !{i32 0, i32 2147483647}", + Err, C); + + ASSERT_TRUE(M && "Could not parse module?"); + ASSERT_TRUE(!verifyModule(*M) && "Must have been well formed!"); + + runWithSE(*M, "foo", [](Function &F, LoopInfo 
&LI, ScalarEvolution &SE) { + auto *X = SE.getSCEV(getInstructionByName(F, "x")); + auto *Context = getInstructionByName(F, "iv.next"); + EXPECT_TRUE(SE.isKnownPredicateAt(ICmpInst::ICMP_NE, X, + SE.getZero(X->getType()), Context)); + }); +} + } // end namespace llvm diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/tool/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/tool/BUILD.gn --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/tool/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/tool/BUILD.gn @@ -25,5 +25,8 @@ } include_dirs = [ ".." ] - sources = [ "ClangdMain.cpp" ] + sources = [ + "Check.cpp", + "ClangdMain.cpp", + ] } diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn @@ -38,6 +38,7 @@ "LoopDataPrefetch.cpp", "LoopDeletion.cpp", "LoopDistribute.cpp", + "LoopFlatten.cpp", "LoopFuse.cpp", "LoopIdiomRecognize.cpp", "LoopInstSimplify.cpp", diff --git a/llvm/utils/lit/lit/llvm/config.py b/llvm/utils/lit/lit/llvm/config.py --- a/llvm/utils/lit/lit/llvm/config.py +++ b/llvm/utils/lit/lit/llvm/config.py @@ -133,7 +133,8 @@ hives = [winreg.HKEY_LOCAL_MACHINE, winreg.HKEY_CURRENT_USER] for mask, hive in itertools.product(masks, hives): try: - with winreg.OpenKey(hive, r"SOFTWARE\GitForWindows", access=winreg.KEY_READ | mask) as key: + with winreg.OpenKey(hive, r"SOFTWARE\GitForWindows", 0, + winreg.KEY_READ | mask) as key: install_root, _ = winreg.QueryValueEx(key, 'InstallPath') if not install_root: @@ -143,7 +144,7 @@ continue # We found it, stop enumerating. - return candidate_path + return lit.util.to_string(candidate_path) except: continue @@ -168,7 +169,7 @@ paths = [] # If we are passed a list [a b c], then iterating this list forwards - # and adding each to the beginning would result in b c a. 
So we + # and adding each to the beginning would result in c b a. So we # need to iterate in reverse to end up with the original ordering. for p in reversed(paths_to_add): # Move it to the front if it already exists, otherwise insert it at the diff --git a/mlir/docs/Tutorials/CreatingADialect.md b/mlir/docs/Tutorials/CreatingADialect.md --- a/mlir/docs/Tutorials/CreatingADialect.md +++ b/mlir/docs/Tutorials/CreatingADialect.md @@ -26,7 +26,7 @@ format](DeclarativeRewrites.md). Note that dialect names should not generally be suffixed with “Ops”, -although some files pertaining to the operations of a dialect (e.g. +although some files pertaining only to the operations of a dialect (e.g. FooOps.cpp) might be. ## CMake best practices @@ -38,10 +38,8 @@ is declared using add_mlir_dialect(). ```cmake - add_mlir_dialect(FooOps foo) add_mlir_doc(FooOps -gen-dialect-doc FooDialect Dialects/) - ``` This generates the correct rules to run mlir-tblgen, along with a @@ -49,6 +47,7 @@ Dialect transformations are typically declared in a file FooTransforms.td. Targets for TableGen are described in typical llvm fashion. + ```cmake set(LLVM_TARGET_DEFINITIONS FooTransforms.td) mlir_tablegen(FooTransforms.h.inc -gen-rewriters) @@ -67,20 +66,18 @@ target_link_libraries() and the PUBLIC keyword. 
For instance: ```cmake - -add_mlir_dialect_library(FooOps - DEPENDS - MLIRFooOpsIncGen - MLIRFooTransformsIncGen - - LINK_COMPONENTS - Core - - LINK_LIBS PUBLIC - BarOps - - ) - +add_mlir_dialect_library(MLIRFoo + DEPENDS + MLIRFooOpsIncGen + MLIRFooTransformsIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRBar + + ) ``` add_mlir_dialect_library() is a thin wrapper around add_llvm_library() @@ -90,9 +87,7 @@ The list can be retrieved from the MLIR_DIALECT_LIBS global property: ```cmake - get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) - ``` Note that although the Bar dialect also uses TableGen to declare its @@ -139,18 +134,16 @@ using target_link_libraries() and the PUBLIC keyword. For instance: ```cmake - add_mlir_conversion_library(MLIRBarToFoo - BarToFoo.cpp + BarToFoo.cpp - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/BarToFoo - - LINK_LIBS PUBLIC - BarOps - FooOps - ) + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/BarToFoo + LINK_LIBS PUBLIC + MLIRBar + MLIRFoo + ) ``` add_mlir_conversion_library() is a thin wrapper around @@ -161,9 +154,7 @@ MLIR_CONVERSION_LIBS global property: ```cmake - get_property(dialect_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) - ``` Note that it is only necessary to specify a PUBLIC dependence against diff --git a/mlir/include/mlir/Dialect/Async/IR/Async.h b/mlir/include/mlir/Dialect/Async/IR/Async.h --- a/mlir/include/mlir/Dialect/Async/IR/Async.h +++ b/mlir/include/mlir/Dialect/Async/IR/Async.h @@ -22,12 +22,28 @@ namespace mlir { namespace async { +namespace detail { +struct ValueTypeStorage; +} // namespace detail + /// The token type to represent asynchronous operation completion. class TokenType : public Type::TypeBase { public: using Base::Base; }; +/// The value type to represent values returned from asynchronous operations. 
+class ValueType + : public Type::TypeBase { +public: + using Base::Base; + + /// Get or create an async ValueType with the provided value type. + static ValueType get(Type valueType); + + Type getValueType(); +}; + } // namespace async } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Async/IR/AsyncBase.td b/mlir/include/mlir/Dialect/Async/IR/AsyncBase.td --- a/mlir/include/mlir/Dialect/Async/IR/AsyncBase.td +++ b/mlir/include/mlir/Dialect/Async/IR/AsyncBase.td @@ -39,4 +39,24 @@ }]; } +class Async_ValueType + : DialectType()">, + SubstLeaves<"$_self", + "$_self.cast<::mlir::async::ValueType>().getValueType()", + type.predicate> + ]>, "async value type with " # type.description # " underlying type"> { + let typeDescription = [{ + `async.value` represents a value returned by asynchronous operations, + which may or may not be available currently, but will be available at some + point in the future. + }]; + + Type valueType = type; +} + +def Async_AnyValueType : Type()">, + "async value type">; + #endif // ASYNC_BASE_TD diff --git a/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td b/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td --- a/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td +++ b/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td @@ -40,24 +40,24 @@ state). All dependencies must be made explicit with async execute arguments (`async.token` or `async.value`). - Example: - ```mlir - %0 = async.execute { - "compute0"(...) - async.yield - } : !async.token + %done, %values = async.execute { + %0 = "compute0"(...) : !some.type + async.yield %1 : f32 + } : !async.token, !async.value - %1 = "compute1"(...) + %1 = "compute1"(...) : !some.type ``` }]; // TODO: Take async.tokens/async.values as arguments. 
let arguments = (ins ); - let results = (outs Async_TokenType:$done); + let results = (outs Async_TokenType:$done, + Variadic:$values); let regions = (region SizedRegion<1>:$body); - let assemblyFormat = "$body attr-dict `:` type($done)"; + let printer = [{ return ::mlir::async::print(p, *this); }]; + let parser = [{ return ::mlir::async::parse$cppClass(parser, result); }]; } def Async_YieldOp : @@ -71,6 +71,8 @@ let arguments = (ins Variadic:$operands); let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?"; + + let verifier = [{ return ::mlir::async::verify(*this); }]; } #endif // ASYNC_OPS diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td @@ -459,6 +459,24 @@ })); }] >, + InterfaceMethod< + /*desc=*/[{ + Return the position of buffer in inputs + outputs list + }], + /*retTy=*/"Optional", + /*methodName=*/"getIndexOfInputAndOutputBuffer", + /*args=*/(ins "Value":$value), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + Optional inputIndex = getIndexOfInput(value); + if (inputIndex.hasValue()) return inputIndex.getValue(); + Optional outputIndex = getIndexOfOutputBuffer(value); + if (outputIndex.hasValue()) { + return $_op.getNumInputs() + outputIndex.getValue(); + } + return llvm::None; + }] + >, //===------------------------------------------------------------------===// // Other interface methods. 
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -18,6 +18,7 @@ namespace mlir { namespace linalg { +struct LinalgFusionOptions; struct LinalgTilingOptions; //===----------------------------------------------------------------------===// @@ -30,6 +31,14 @@ SmallVector loops; }; +struct TiledAndFusedLinalgOps { + LinalgOp op; + SmallVector fusedProducers; + SmallVector originalProducers; + SmallVector fusedLoops; + SmallVector unfusedLoops; +}; + /// Populates patterns for vectorization of all ConvN-D ops. void populateConvVectorizationPatterns( MLIRContext *context, SmallVectorImpl &patterns, @@ -53,6 +62,71 @@ Optional tileLinalgOp(OpBuilder &b, LinalgOp op, const LinalgTilingOptions &options); +/// Tile and fuse the `op` with its producers. The tile and fuse proceeds in +/// three steps +/// - Find tile loops that are fusable with its producer tile loops (a.k.a. tile +/// + fuse loops). +/// - Tile just these loops of the consumer (root operation) and fuse with +/// the producer. +/// - Tile again the tiled consumer operation produced above to do rest of +/// the tiling specified by the `tilingOptions`. +/// +/// For example, consider the sequence of matmul below +/// +/// linalg.matmul ins(%arg0, %arg1 : memref<256x32xf32>, memref<32x32xf32>) +/// outs(%arg2 : memref<256x32xf32>) +/// linalg.matmul ins(%arg2, %arg3 : memref<256x32xf32>, memref<32x32xf32>) +/// outs(%arg4 : memref<256x32xf32>) +/// +/// It is legal to fuse the RAW dependence (through %arg2) by only fusing the +/// matmuls row-wise. For example, the fused computation for the above is shown +/// below. The outer `scf.parallel` loop is the "fused" loop obtained by tiling +/// along the rows of the matrix. 
The entire rows of the first matmul operation +/// need to be computed before they can be used for the second matmul. The +/// second matmul is further tiled (similar to normal tiling). +/// +/// #map0 = affine_map<(d0, d1)[s0] -> (d0 * 32 + s0 + d1)> +/// #map1 = affine_map<(d0, d1) -> (d0 * 32 + d1)> +/// scf.parallel (%arg5) = (%c0) to (%c256) step (%c16) { +/// %0 = subview %arg2[%arg5, 0] [16, 32] [1, 1] +/// : memref<256x32xf32> to memref<16x32xf32, #map0> +/// %1 = subview %arg4[%arg5, 0] [16, 32] [1, 1] +/// : memref<256x32xf32> to memref<16x32xf32, #map0> +/// %2 = subview %arg0[%arg5, 0] [16, 32] [1, 1] +/// : memref<256x32xf32> to memref<16x32xf32, #map0> +/// %3 = subview %arg1[0, 0] [32, 32] [1, 1] +/// : memref<32x32xf32> to memref<32x32xf32, #map1> +/// linalg.matmul +/// ins(%2, %3 : memref<16x32xf32, #map0>, memref<32x32xf32, #map1>) +/// outs(%0 : memref<16x32xf32, #map0>) +/// scf.parallel (%arg6) = (%c0) to (%c32) step (%c8) { +/// scf.for %arg7 = %c0 to %c32 step %c4 { +/// %4 = subview %0[0, %arg7] [16, 4] [1, 1] +/// : memref<16x32xf32, #map0> to memref<16x4xf32, #map0> +/// %5 = subview %arg3[%arg7, %arg6] [4, 8] [1, 1] +/// : memref<32x32xf32> to memref<4x8xf32, #map0> +/// %6 = subview %1[0, %arg6] [16, 8] [1, 1] +/// : memref<16x32xf32, #map0> to memref<16x8xf32, #map0> +/// linalg.matmul +/// ins(%4, %5 : memref<16x4xf32, #map0>, memref<4x8xf32, #map0>) +/// outs(%6 : memref<16x8xf32, #map0>) +/// } +/// scf.yield +/// } +/// scf.yield +/// } +/// +/// The following tiling options are handled differently in tile+fuse (compared +/// to tile only) +/// - Interchange of the tiling loops is not supported right now. +/// - Distribution is only done for the tile+fuse loops. The tiled loops +/// generated by the second tiling is not distributed. 
+Optional +tileAndFuseLinalgOps(PatternRewriter &rewriter, LinalgOp op, + const LinalgDependenceGraph &dependenceGraph, + const LinalgTilingOptions &tilingOptions, + const LinalgFusionOptions &fusionOptions); + /// Interchanges the `iterator_types` and `iterator_maps` dimensions of `op`. /// This is an in-place transformation controlled by `interchangeVector`. /// An empty vector is interpreted as the identity permutation and the @@ -252,9 +326,9 @@ /// Set the `tileSizeComputationFunction` to return the values `ts`. The /// values must not fold away when tiling. Otherwise, use a more robust /// `tileSizeComputationFunction`. - LinalgTilingOptions &setTileSizes(ValueRange ts) { - tileSizeComputationFunction = [&](OpBuilder &, Operation *) { - return SmallVector(ts.begin(), ts.end()); + LinalgTilingOptions &setTileSizes(SmallVector ts) { + tileSizeComputationFunction = [=](OpBuilder &, Operation *) { + return ts; }; return *this; } @@ -323,6 +397,63 @@ } }; +struct LinalgFusionOptions { + /// Optional list of operands indices to use for fusion. When unspecified, + /// only one fusion is done, i.e., the pattern returns after the first fusion. + Optional> indicesToFuse = None; + LinalgFusionOptions &setIndicesToFuse(ArrayRef operands) { + indicesToFuse = DenseSet(); + indicesToFuse->insert(operands.begin(), operands.end()); + return *this; + } +}; + +struct LinalgBaseTileAndFusePattern : public RewritePattern { + LinalgBaseTileAndFusePattern(StringRef opName, MLIRContext *context, + const LinalgDependenceGraph &dependenceGraph, + LinalgTilingOptions tilingOptions, + LinalgFusionOptions fusionOptions, + LinalgMarker marker = LinalgMarker(), + LinalgMarker fusedOpMarker = LinalgMarker(), + LinalgMarker originalOpMarker = LinalgMarker(), + PatternBenefit benefit = 1); + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override; + +private: + /// Dependence graph needed for fusion. 
+ const LinalgDependenceGraph &dependenceGraph; + /// Options to control tiling. + LinalgTilingOptions tilingOptions; + /// Options to control fusion. + LinalgFusionOptions fusionOptions; + /// Marker to control application of the pattern. + LinalgMarker marker; + /// Marker set on the fused op after tile and fuse. + LinalgMarker fusedOpMarker; + /// The dependenceGraph is not modifiable, i.e. if the Linalg operations used + /// to build the dependence graph changes then the dependenceGraph needs to be + /// recomputed right now. To not invalidate the dependenceGraph as + /// transformation happens, the original producer can be tagged with a marker + /// that can be later used to delete the original operations. + LinalgMarker originalOpMarker; +}; + +template +struct LinalgTileAndFusePattern : public LinalgBaseTileAndFusePattern { + LinalgTileAndFusePattern(MLIRContext *context, + const LinalgDependenceGraph &dependenceGraph, + LinalgTilingOptions tilingOptions, + LinalgFusionOptions fusionOptions, + LinalgMarker marker = LinalgMarker(), + LinalgMarker fusedOpMarker = LinalgMarker(), + LinalgMarker originalOpMarker = LinalgMarker(), + PatternBenefit benefit = 1) + : LinalgBaseTileAndFusePattern( + OpTy::getOperationName(), context, dependenceGraph, tilingOptions, + fusionOptions, marker, fusedOpMarker, originalOpMarker, benefit) {} +}; + /// /// Linalg interchange patterns. 
/// diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -10,6 +10,7 @@ #define MLIR_DIALECT_LINALG_UTILS_H_ #include "mlir/Dialect/Affine/EDSC/Intrinsics.h" +#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" #include "mlir/Dialect/Linalg/EDSC/Builders.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/SCF/SCF.h" diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td --- a/mlir/include/mlir/Dialect/Vector/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td @@ -454,6 +454,71 @@ }]; } +def Vector_ExtractMapOp : + Vector_Op<"extract_map", [NoSideEffect]>, + Arguments<(ins AnyVector:$vector, Index:$id, I64Attr:$multiplicity)>, + Results<(outs AnyVector)> { + let summary = "vector extract map operation"; + let description = [{ + Takes an 1-D vector and extract a sub-part of the vector starting at id with + a size of `vector size / multiplicity`. This maps a given multiplicity of + the vector to a Value such as a loop induction variable or an SPMD id. + + Similarly to vector.tuple_get, this operation is used for progressive + lowering and should be folded away before converting to LLVM. 
+ + + For instance, the following code: + ```mlir + %a = vector.transfer_read %A[%c0]: memref<32xf32>, vector<32xf32> + %b = vector.transfer_read %B[%c0]: memref<32xf32>, vector<32xf32> + %c = addf %a, %b: vector<32xf32> + vector.transfer_write %c, %C[%c0]: memref<32xf32>, vector<32xf32> + ``` + can be rewritten to: + ```mlir + %a = vector.transfer_read %A[%c0]: memref<32xf32>, vector<32xf32> + %b = vector.transfer_read %B[%c0]: memref<32xf32>, vector<32xf32> + %ea = vector.extract_map %a[%id : 32] : vector<32xf32> to vector<1xf32> + %eb = vector.extract_map %b[%id : 32] : vector<32xf32> to vector<1xf32> + %ec = addf %ea, %eb : vector<1xf32> + %c = vector.insert_map %ec, %id, 32 : vector<1xf32> to vector<32xf32> + vector.transfer_write %c, %C[%c0]: memref<32xf32>, vector<32xf32> + ``` + + Where %id can be an induction variable or an SPMD id going from 0 to 31. + + And then be rewritten to: + ```mlir + %a = vector.transfer_read %A[%id]: memref<32xf32>, vector<1xf32> + %b = vector.transfer_read %B[%id]: memref<32xf32>, vector<1xf32> + %c = addf %a, %b: vector<1xf32> + vector.transfer_write %c, %C[%id]: memref<32xf32>, vector<1xf32> + ``` + + Example: + + ```mlir + %ev = vector.extract_map %v[%id:32] : vector<32xf32> to vector<1xf32> + ``` + }]; + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, " # + "Value vector, Value id, int64_t multiplicity">]; + let extraClassDeclaration = [{ + VectorType getSourceVectorType() { + return vector().getType().cast(); + } + VectorType getResultType() { + return getResult().getType().cast(); + } + }]; + let assemblyFormat = [{ + $vector `[` $id `:` $multiplicity `]` attr-dict `:` type($vector) `to` + type(results) + }]; +} + def Vector_FMAOp : Op]>, @@ -626,6 +691,46 @@ }]; } +def Vector_InsertMapOp : + Vector_Op<"insert_map", [NoSideEffect]>, + Arguments<(ins AnyVector:$vector, Index:$id, I64Attr:$multiplicity)>, + Results<(outs AnyVector)> { + let summary = "vector insert map operation"; + let description 
= [{ + insert an 1-D vector and within a larger vector starting at id. The new + vector created will have a size of `vector size * multiplicity`. This + represents how a sub-part of the vector is written for a given Value such as + a loop induction variable or an SPMD id. + + Similarly to vector.tuple_get, this operation is used for progressive + lowering and should be folded away before converting to LLVM. + + This operations is meant to be used in combination with vector.extract_map. + See example in extract.map description. + + Example: + + ```mlir + %v = vector.insert_map %ev, %id, 32 : vector<1xf32> to vector<32xf32> + ``` + }]; + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, " # + "Value vector, Value id, int64_t multiplicity">]; + let extraClassDeclaration = [{ + VectorType getSourceVectorType() { + return vector().getType().cast(); + } + VectorType getResultType() { + return getResult().getType().cast(); + } + }]; + let assemblyFormat = [{ + $vector `,` $id `,` $multiplicity attr-dict `:` type($vector) `to` + type(results) + }]; +} + def Vector_InsertStridedSliceOp : Vector_Op<"insert_strided_slice", [NoSideEffect, PredOpTrait<"operand #0 and result have same element type", diff --git a/mlir/include/mlir/Dialect/Vector/VectorTransforms.h b/mlir/include/mlir/Dialect/Vector/VectorTransforms.h --- a/mlir/include/mlir/Dialect/Vector/VectorTransforms.h +++ b/mlir/include/mlir/Dialect/Vector/VectorTransforms.h @@ -172,6 +172,47 @@ FilterConstraintType filter; }; +struct DistributeOps { + ExtractMapOp extract; + InsertMapOp insert; +}; + +/// Distribute a 1D vector pointwise operation over a range of given IDs taking +/// *all* values in [0 .. multiplicity - 1] (e.g. loop induction variable or +/// SPMD id). This transformation only inserts +/// vector.extract_map/vector.insert_map. It is meant to be used with +/// canonicalizations pattern to propagate and fold the vector +/// insert_map/extract_map operations. 
+/// Transforms: +// %v = addf %a, %b : vector<32xf32> +/// to: +/// %v = addf %a, %b : vector<32xf32> %ev = +/// vector.extract_map %v, %id, 32 : vector<32xf32> into vector<1xf32> %nv = +/// vector.insert_map %ev, %id, 32 : vector<1xf32> into vector<32xf32> +Optional distributPointwiseVectorOp(OpBuilder &builder, + Operation *op, Value id, + int64_t multiplicity); +/// Canonicalize an extra element using the result of a pointwise operation. +/// Transforms: +/// %v = addf %a, %b : vector32xf32> +/// %dv = vector.extract_map %v, %id, 32 : vector<32xf32> into vector<1xf32> +/// to: +/// %da = vector.extract_map %a, %id, 32 : vector<32xf32> into vector<1xf32> +/// %db = vector.extract_map %a, %id, 32 : vector<32xf32> into vector<1xf32> +/// %dv = addf %da, %db : vector<1xf32> +struct PointwiseExtractPattern : public OpRewritePattern { + using FilterConstraintType = std::function; + PointwiseExtractPattern( + MLIRContext *context, FilterConstraintType constraint = + [](ExtractMapOp op) { return success(); }) + : OpRewritePattern(context), filter(constraint) {} + LogicalResult matchAndRewrite(ExtractMapOp extract, + PatternRewriter &rewriter) const override; + +private: + FilterConstraintType filter; +}; + } // namespace vector //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/IR/Dialect.h b/mlir/include/mlir/IR/Dialect.h --- a/mlir/include/mlir/IR/Dialect.h +++ b/mlir/include/mlir/IR/Dialect.h @@ -150,10 +150,11 @@ /// This method is used by derived classes to add their operations to the set. /// template void addOperations() { - (void)std::initializer_list{ - 0, (addOperation(AbstractOperation::get(*this)), 0)...}; + (void)std::initializer_list{0, (addOperation(), 0)...}; + } + template void addOperation() { + addOperation(AbstractOperation::get(*this)); } - void addOperation(AbstractOperation opInfo); /// Register a set of type classes with this dialect. 
diff --git a/mlir/lib/Analysis/CMakeLists.txt b/mlir/lib/Analysis/CMakeLists.txt --- a/mlir/lib/Analysis/CMakeLists.txt +++ b/mlir/lib/Analysis/CMakeLists.txt @@ -21,7 +21,7 @@ mlir-headers LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRCallInterfaces MLIRControlFlowInterfaces MLIRInferTypeOpInterface @@ -43,7 +43,7 @@ mlir-headers LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRCallInterfaces MLIRControlFlowInterfaces MLIRInferTypeOpInterface diff --git a/mlir/lib/CAPI/Standard/CMakeLists.txt b/mlir/lib/CAPI/Standard/CMakeLists.txt --- a/mlir/lib/CAPI/Standard/CMakeLists.txt +++ b/mlir/lib/CAPI/Standard/CMakeLists.txt @@ -7,5 +7,5 @@ LINK_LIBS PUBLIC MLIRCAPIIR - MLIRStandardOps + MLIRStandard ) diff --git a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt --- a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt +++ b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt @@ -11,10 +11,10 @@ Core LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRSCF MLIRPass - MLIRStandardOps + MLIRStandard MLIRTransforms MLIRIR ) diff --git a/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt --- a/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt @@ -16,7 +16,7 @@ MLIRPass MLIRSCFToSPIRV MLIRSPIRV - MLIRStandardOps + MLIRStandard MLIRStandardToSPIRVTransforms MLIRSupport MLIRTransforms diff --git a/mlir/lib/Conversion/GPUToVulkan/CMakeLists.txt b/mlir/lib/Conversion/GPUToVulkan/CMakeLists.txt --- a/mlir/lib/Conversion/GPUToVulkan/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToVulkan/CMakeLists.txt @@ -12,7 +12,7 @@ MLIRPass MLIRSPIRV MLIRSPIRVSerialization - MLIRStandardOps + MLIRStandard MLIRSupport MLIRTransforms MLIRTranslation diff --git a/mlir/lib/Conversion/LinalgToLLVM/CMakeLists.txt b/mlir/lib/Conversion/LinalgToLLVM/CMakeLists.txt --- a/mlir/lib/Conversion/LinalgToLLVM/CMakeLists.txt +++ 
b/mlir/lib/Conversion/LinalgToLLVM/CMakeLists.txt @@ -15,7 +15,7 @@ MLIRAffineToStandard MLIREDSC MLIRIR - MLIRLinalgOps + MLIRLinalg MLIRLLVMIR MLIRSCFToStandard MLIRStandardToLLVM diff --git a/mlir/lib/Conversion/LinalgToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/LinalgToSPIRV/CMakeLists.txt --- a/mlir/lib/Conversion/LinalgToSPIRV/CMakeLists.txt +++ b/mlir/lib/Conversion/LinalgToSPIRV/CMakeLists.txt @@ -11,7 +11,7 @@ LINK_LIBS PUBLIC MLIRIR - MLIRLinalgOps + MLIRLinalg MLIRLinalgUtils MLIRPass MLIRSPIRV diff --git a/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt b/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt --- a/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt +++ b/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt @@ -13,7 +13,7 @@ LINK_LIBS PUBLIC MLIREDSC MLIRIR - MLIRLinalgOps + MLIRLinalg MLIRPass MLIRSCF MLIRTransforms diff --git a/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt b/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt --- a/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt +++ b/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt @@ -9,13 +9,13 @@ MLIRConversionPassIncGen LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRAffineToStandard MLIRGPU MLIRIR - MLIRLinalgOps + MLIRLinalg MLIRPass - MLIRStandardOps + MLIRStandard MLIRSupport MLIRTransforms ) diff --git a/mlir/lib/Conversion/SCFToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/SCFToSPIRV/CMakeLists.txt --- a/mlir/lib/Conversion/SCFToSPIRV/CMakeLists.txt +++ b/mlir/lib/Conversion/SCFToSPIRV/CMakeLists.txt @@ -8,13 +8,13 @@ MLIRConversionPassIncGen LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRAffineToStandard MLIRSPIRV MLIRIR - MLIRLinalgOps + MLIRLinalg MLIRPass - MLIRStandardOps + MLIRStandard MLIRSupport MLIRTransforms ) diff --git a/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt --- a/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt +++ b/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt @@ -17,5 +17,5 @@ MLIRSupport 
MLIRTransformUtils MLIRSPIRV - MLIRStandardOps + MLIRStandard ) diff --git a/mlir/lib/Dialect/Affine/EDSC/CMakeLists.txt b/mlir/lib/Dialect/Affine/EDSC/CMakeLists.txt --- a/mlir/lib/Dialect/Affine/EDSC/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/EDSC/CMakeLists.txt @@ -8,10 +8,10 @@ MLIRAffineOpsIncGen LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIREDSC MLIRIR MLIRLoopLikeInterface MLIRSideEffectInterfaces - MLIRStandardOps + MLIRStandard ) diff --git a/mlir/lib/Dialect/Affine/IR/CMakeLists.txt b/mlir/lib/Dialect/Affine/IR/CMakeLists.txt --- a/mlir/lib/Dialect/Affine/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/IR/CMakeLists.txt @@ -1,4 +1,4 @@ -add_mlir_dialect_library(MLIRAffineOps +add_mlir_dialect_library(MLIRAffine AffineMemoryOpInterfaces.cpp AffineOps.cpp AffineValueMap.cpp @@ -15,5 +15,5 @@ MLIRIR MLIRLoopLikeInterface MLIRSideEffectInterfaces - MLIRStandardOps + MLIRStandard ) diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt --- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt @@ -18,13 +18,13 @@ MLIRLoopLikeInterfaceIncGen LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRAffineUtils MLIREDSC MLIRIR MLIRPass MLIRSideEffectInterfaces - MLIRStandardOps + MLIRStandard MLIRTransformUtils MLIRVector MLIRVectorToLLVM diff --git a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt --- a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt @@ -5,6 +5,6 @@ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRTransformUtils ) diff --git a/mlir/lib/Dialect/Async/IR/Async.cpp b/mlir/lib/Dialect/Async/IR/Async.cpp --- a/mlir/lib/Dialect/Async/IR/Async.cpp +++ b/mlir/lib/Dialect/Async/IR/Async.cpp @@ -19,8 +19,8 @@ #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/raw_ostream.h" -using namespace mlir; -using 
namespace mlir::async; +namespace mlir { +namespace async { void AsyncDialect::initialize() { addOperations< @@ -28,6 +28,7 @@ #include "mlir/Dialect/Async/IR/AsyncOps.cpp.inc" >(); addTypes(); + addTypes(); } /// Parse a type registered to this dialect. @@ -39,6 +40,15 @@ if (keyword == "token") return TokenType::get(getContext()); + if (keyword == "value") { + Type ty; + if (parser.parseLess() || parser.parseType(ty) || parser.parseGreater()) { + parser.emitError(parser.getNameLoc(), "failed to parse async value type"); + return Type(); + } + return ValueType::get(ty); + } + parser.emitError(parser.getNameLoc(), "unknown async type: ") << keyword; return Type(); } @@ -46,9 +56,113 @@ /// Print a type registered to this dialect. void AsyncDialect::printType(Type type, DialectAsmPrinter &os) const { TypeSwitch(type) - .Case([&](Type) { os << "token"; }) + .Case([&](TokenType) { os << "token"; }) + .Case([&](ValueType valueTy) { + os << "value<"; + os.printType(valueTy.getValueType()); + os << '>'; + }) .Default([](Type) { llvm_unreachable("unexpected 'async' type kind"); }); } +//===----------------------------------------------------------------------===// +/// ValueType +//===----------------------------------------------------------------------===// + +namespace detail { + +// Storage for `async.value` type, the only member is the wrapped type. +struct ValueTypeStorage : public TypeStorage { + ValueTypeStorage(Type valueType) : valueType(valueType) {} + + /// The hash key used for uniquing. + using KeyTy = Type; + bool operator==(const KeyTy &key) const { return key == valueType; } + + /// Construction. 
+ static ValueTypeStorage *construct(TypeStorageAllocator &allocator, + Type valueType) { + return new (allocator.allocate()) + ValueTypeStorage(valueType); + } + + Type valueType; +}; + +} // namespace detail + +ValueType ValueType::get(Type valueType) { + return Base::get(valueType.getContext(), valueType); +} + +Type ValueType::getValueType() { return getImpl()->valueType; } + +//===----------------------------------------------------------------------===// +// YieldOp +//===----------------------------------------------------------------------===// + +static LogicalResult verify(YieldOp op) { + // Get the underlying value types from async values returned from the + // parent `async.execute` operation. + auto executeOp = op.getParentOfType(); + auto types = llvm::map_range(executeOp.values(), [](const OpResult &result) { + return result.getType().cast().getValueType(); + }); + + if (!std::equal(types.begin(), types.end(), op.getOperandTypes().begin())) + return op.emitOpError("Operand types do not match the types returned from " + "the parent ExecuteOp"); + + return success(); +} + +//===----------------------------------------------------------------------===// +/// ExecuteOp +//===----------------------------------------------------------------------===// + +static void print(OpAsmPrinter &p, ExecuteOp op) { + p << "async.execute "; + p.printRegion(op.body()); + p.printOptionalAttrDict(op.getAttrs()); + p << " : "; + p.printType(op.done().getType()); + if (!op.values().empty()) + p << ", "; + llvm::interleaveComma(op.values(), p, [&](const OpResult &result) { + p.printType(result.getType()); + }); +} + +static ParseResult parseExecuteOp(OpAsmParser &parser, OperationState &result) { + MLIRContext *ctx = result.getContext(); + + // Parse asynchronous region. + Region *body = result.addRegion(); + if (parser.parseRegion(*body, /*arguments=*/{}, /*argTypes=*/{}, + /*enableNameShadowing=*/false)) + return failure(); + + // Parse operation attributes. 
+ NamedAttrList attrs; + if (parser.parseOptionalAttrDict(attrs)) + return failure(); + result.addAttributes(attrs); + + // Parse result types. + SmallVector resultTypes; + if (parser.parseColonTypeList(resultTypes)) + return failure(); + + // First result type must be an async token type. + if (resultTypes.empty() || resultTypes.front() != TokenType::get(ctx)) + return failure(); + parser.addTypesToList(resultTypes, result.types); + + return success(); +} + +} // namespace async +} // namespace mlir + #define GET_OP_CLASSES #include "mlir/Dialect/Async/IR/AsyncOps.cpp.inc" diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -21,7 +21,7 @@ MLIRSCF MLIRPass MLIRSideEffectInterfaces - MLIRStandardOps + MLIRStandard MLIRSupport MLIRTransformUtils ) diff --git a/mlir/lib/Dialect/Linalg/Analysis/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Analysis/CMakeLists.txt --- a/mlir/lib/Dialect/Linalg/Analysis/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Analysis/CMakeLists.txt @@ -1,11 +1,11 @@ add_mlir_dialect_library(MLIRLinalgAnalysis DependenceAnalysis.cpp - + ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Linalg LINK_LIBS PUBLIC MLIRIR - MLIRLinalgOps - MLIRStandardOps + MLIRLinalg + MLIRStandard ) diff --git a/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt b/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt --- a/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt @@ -7,9 +7,9 @@ LINK_LIBS PUBLIC MLIREDSC MLIRIR - MLIRAffineOps + MLIRAffine MLIRAffineEDSC - MLIRLinalgOps + MLIRLinalg MLIRSCF - MLIRStandardOps + MLIRStandard ) diff --git a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt --- a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt @@ -1,4 +1,4 @@ -add_mlir_dialect_library(MLIRLinalgOps +add_mlir_dialect_library(MLIRLinalg 
LinalgOps.cpp LinalgTypes.cpp @@ -14,5 +14,5 @@ MLIRIR MLIRSideEffectInterfaces MLIRViewLikeInterface - MLIRStandardOps + MLIRStandard ) diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -1,6 +1,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms DropUnitDims.cpp Fusion.cpp + FusionOnTensors.cpp Hoisting.cpp Interchange.cpp Loops.cpp @@ -17,18 +18,18 @@ MLIRLinalgPassIncGen LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRAnalysis MLIREDSC MLIRIR MLIRLinalgAnalysis MLIRLinalgEDSC - MLIRLinalgOps + MLIRLinalg MLIRLinalgUtils MLIRSCF MLIRSCFTransforms MLIRPass - MLIRStandardOps + MLIRStandard MLIRStandardToLLVM MLIRTransformUtils MLIRVector diff --git a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/Linalg/IR/LinalgTypes.h" #include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h" #include "mlir/IR/AffineExpr.h" @@ -154,9 +155,9 @@ llvm_unreachable("Expect to be able to extract a view defining loop range"); } -static LinalgOp fuse(Value producedView, LinalgOp producer, LinalgOp consumer, - unsigned consumerIdx, unsigned producerIdx, - OperationFolder *folder) { +static LinalgOp fuse(OpBuilder &b, LinalgOp producer, unsigned producerIdx, + LinalgOp consumer, unsigned consumerIdx, + OperationFolder *folder = nullptr) { assert(producer.hasBufferSemantics() && "expected linalg op with buffer semantics"); assert(consumer.hasBufferSemantics() && @@ -174,9 +175,7 @@ // we can always identify a data dimension with a (at least one) 
loop // dimension. AffineMap producerMap = - producer.indexing_maps()[producer.getNumInputs() + producerIdx] - .cast() - .getValue(); + producer.indexing_maps()[producerIdx].cast().getValue(); LLVM_DEBUG(dbgs() << "Producer Idx: " << producerIdx << ", producer map: " << producerMap << "\n"); @@ -185,10 +184,9 @@ unsigned nWin = producer.getNumWindowLoops(); SmallVector loopRanges(nPar + nRed + nWin); - OpBuilder b(consumer.getOperation()); - auto loc = consumer.getLoc(); // Iterate over dimensions identified by the producer map for `producerIdx`. // This defines a subset of the loop ranges that we need to complete later. + auto loc = consumer.getLoc(); for (auto en : llvm::enumerate(producerMap.getResults())) { unsigned posInProducerLoop = en.value().cast().getPosition(); loopRanges[posInProducerLoop] = @@ -319,71 +317,380 @@ return true; } -static Optional -fuseProducerOfDep(OpBuilder &b, LinalgOp consumer, unsigned consumerIdx, - const LinalgDependenceGraph &graph, OperationFolder *folder, - LinalgDependenceGraph::DependenceType depType) { - assert(consumer.hasBufferSemantics() && - "expected linalg op with buffer semantics"); - LLVM_DEBUG(dbgs() << "\nStart examining consumer: " - << *consumer.getOperation()); - for (auto dependence : graph.getDependencesInto(consumer, depType)) { - LLVM_DEBUG(dbgs() << "\n***Consider producer:\t" - << *dependence.dependentOpView.op << "\n"); - auto producer = cast(dependence.dependentOpView.op); +static Optional +findFusableProducer(LinalgOp consumer, unsigned consumerIdx, + const LinalgDependenceGraph &dependenceGraph) { + // Only consider RAW and WAW atm. + for (auto depType : { + LinalgDependenceGraph::DependenceType::RAW, + LinalgDependenceGraph::DependenceType::WAW, + }) { + for (auto dependence : + dependenceGraph.getDependencesInto(consumer, depType)) { + auto producer = cast(dependence.dependentOpView.op); - // Check that the dependence is indeed on the input `consumerIdx` view. 
- auto consumedView = dependence.indexingView; - if (!isSameSubView(consumer.getBuffer(consumerIdx), consumedView)) - continue; + // Check that the dependence is indeed on the input `consumerIdx` view. + auto consumedView = dependence.indexingView; + if (!isSameSubView(consumer.getBuffer(consumerIdx), consumedView)) + continue; - // Consumer consumes this view, `isStructurallyFusableProducer` also checks - // whether it is a strict subview of the producer view. - auto producedView = dependence.dependentOpView.view; - auto producerIdx = producer.getIndexOfOutputBuffer(producedView).getValue(); - // `consumerIdx` and `producerIdx` exist by construction. - LLVM_DEBUG(dbgs() << "\n" - << LinalgDependenceGraph::getDependenceTypeStr(depType) - << "producer: " << *producer.getOperation() << " view: " - << producedView << " output index: " << producerIdx); + // Consumer consumes this view, `isStructurallyFusableProducer` also + // checks whether it is a strict subview of the producer view. + auto producedView = dependence.dependentOpView.view; + auto producerIdx = + producer.getIndexOfOutputBuffer(producedView).getValue(); + // `consumerIdx` and `producerIdx` exist by construction. + LLVM_DEBUG(dbgs() << "\n" + << LinalgDependenceGraph::getDependenceTypeStr(depType) + << "producer: " << *producer.getOperation() << " view: " + << producedView << " output index: " << producerIdx); + (void)producerIdx; + + // Simple fusability checks. + if (!isFusableInto(dependenceGraph, consumer, consumedView, producer)) + continue; - // Must be a subview or a slice to guarantee there are loops we can fuse - // into. 
- auto subView = consumedView.getDefiningOp(); - auto slice = consumedView.getDefiningOp(); - if (!subView && !slice) { - LLVM_DEBUG(dbgs() << "\nNot fusable (not a subview or slice)"); - continue; + return dependence; } + } + return {}; +} + +Optional mlir::linalg::fuseProducerOf( + OpBuilder &b, LinalgOp consumer, unsigned consumerIdx, + const LinalgDependenceGraph &graph, OperationFolder *folder) { + Optional fusableDependence = + findFusableProducer(consumer, consumerIdx, graph); + if (!fusableDependence) + return {}; + + LinalgOp producerOp = cast(fusableDependence->dependentOpView.op); + Value producerView = fusableDependence->dependentOpView.view; + Value consumerView = fusableDependence->indexingView; + + // Must be a subview or a slice to guarantee there are loops we can fuse + // into. + auto subView = consumerView.getDefiningOp(); + auto slice = consumerView.getDefiningOp(); + if (!subView && !slice) { + LLVM_DEBUG(dbgs() << "\nNot fusable (not a subview or slice)"); + return {}; + } + + // Fuse `producer` just before `consumer`. + OpBuilder::InsertionGuard g(b); + b.setInsertionPoint(consumer.getOperation()); + ScopedContext scope(b, consumer.getLoc()); + LLVM_DEBUG(dbgs() << "Fuse into consumer: " << *consumer << "\n"); + Optional producerIdxOpt = + producerOp.getIndexOfInputAndOutputBuffer(producerView); + assert(producerIdxOpt.hasValue() && "incorrect operand index"); + unsigned producerIdx = producerIdxOpt.getValue(); + + auto fusedProducer = + fuse(b, producerOp, producerIdx, consumer, consumerIdx, folder); + return FusionInfo{producerOp, fusedProducer}; +} - // Simple fusability checks. - if (!isFusableInto(graph, consumer, consumedView, producer)) +/// Returns the positions of the loop in `op` that can be tiled based on the +/// operations that are to be fused with it. For example, in a +/// +/// linalg. matmul ins(%a, %b : ...) outs(%c : ...) 
+/// +/// if the producer of %a needs to be fused with this op, only the `i` loop of +/// the matmul can be tiled while fusing. If producer of %a, and %b are to be +/// fused, then no loops can be tiled while fusing. +static DenseSet collectTileAndFuseLoops( + LinalgOp op, ArrayRef + fusableDependences) { + // 1. Only parallel loops can be used for tile + fuse. Find the number of + // common outer parallel loops between the op and its producers being fused. + auto getNumOuterParallelLoops = [](LinalgOp linalgOp) { + return linalgOp.iterator_types() + .getValue() + .take_while([](Attribute attr) -> bool { + return attr.cast().getValue() == + getParallelIteratorTypeName(); + }) + .size(); + }; + + size_t numOuterParallelLoops = getNumOuterParallelLoops(op); + for (auto dependence : fusableDependences) { + numOuterParallelLoops = + std::min(numOuterParallelLoops, getNumOuterParallelLoops(cast( + dependence.dependentOpView.op))); + } + + // Need to compute what tiled loops can be "fused". Given the precondition + // that all indexing map for the producer view is a projected permutation, we + // can assert that the producer iterates over the dimensions of the "fused + // view" only once. To be used a fused loop the producer should use this loop + // to access the fused view. For example, consider + // + // ``` + // linalg.add ins(%a, %b) outs(%c) + // linalg.matmul ins(%d, %c) outs(%e) + // ``` + // + // if `linalg.add` has the semantics of `c = a + b`, then the following + // tile+fuse code is correct. + // + // ``` + // for j ... += TSj + // %sa = subview %a[0, %j][...] + // %sb = subview %b[0, %j][...] + // %sc = subview %c[0, %j][...] + // %sd = subview %d[0, 0][...] + // %se = subview %e[0, %j][...] + // linalg.add ins(%sa, %sb) outs(%sc) + // linalg.matmul ins(%sd, %sc) outs(%se) + // ``` + // + // On the other hand tiling along i would be incorrect + // + // ``` + // for %i .. += TSi + // %sa = subview %a[%i, 0][...] + // %sb = subview %b[%i, 0][...] 
+ // %sc = subview %c[%i, 0][...] + // %sc2 = subview %c[0, 0][...] + // %sd = subview %d[%i, 0][...] + // %se = subview %e[%i, 0][...] + // linalg.add ins(%sa, %sb) outs(%sc) + // linalg.matmul ins(%sd, %sc2) outs(%se) + // ``` + // + // The write to the subview `%sc` in `linalg.add` is performed after the read + // from it using `%sc2` violating the RAW dependence of the original code. To + // find such loops indexing map of the fused view in the consumer op is + // used. For the above example, this indexing map is + // + // affine_map<(d0, d1, d2) -> (d2, d1)> + // + // Since d0 is not in the result expressions of this map, it is not treated as + // tile + fuse loop, (but d1 is). + // + // TODO: The above is probably restrictive and there might be a generalization + // of these that might allow for more fusion opportunities. Explore based on + // needs. + SmallVector, 1> commonTilableLoops; + for (auto dependence : fusableDependences) { + unsigned consumerIdx = + op.getIndexOfInputAndOutputBuffer(dependence.indexingView).getValue(); + AffineMap consumerAccess = op.getIndexingMap(consumerIdx); + // Previously asserted that the consumerAccess map is a projected + // permutation, so all results are known to be AffineDimExprs. To remove + // this restriction walk the expression to find which dimensions of the + // consumer loop appear in the `consumerAccess`. + DenseSet positions; + for (auto expr : consumerAccess.getResults()) + positions.insert(expr.cast().getPosition()); + commonTilableLoops.emplace_back(std::move(positions)); + } + + // 2. Of the outer parallel loops, only those loops can be tiled + fused as + // computed above for all the fused dependences can be used to tile and fuse. 
+  DenseSet tilableParallelLoops; + for (auto index : llvm::seq(0, numOuterParallelLoops)) { + if (llvm::all_of(commonTilableLoops, + [&](const DenseSet &tilableLoops) { + return tilableLoops.count(index); + })) + tilableParallelLoops.insert(index); + } + return tilableParallelLoops; +} + +/// Find all dependences that are to be fusable. +static Optional< + SmallVector> +findAllFusableDependences(LinalgOp op, + const LinalgDependenceGraph &dependenceGraph, + const LinalgFusionOptions &fusionOptions) { + SmallVector + fusableDependences; + for (auto operand : llvm::enumerate(op.getInputsAndOutputBuffers())) { + if (fusionOptions.indicesToFuse && + !fusionOptions.indicesToFuse->count(operand.index())) continue; + Optional + fusableDependence = + findFusableProducer(op, operand.index(), dependenceGraph); + if (!fusableDependence) + continue; + // Make sure that the indexing map of the view used for fusion in the + // producer is a projected permutation. + LinalgOp producerOp = cast(fusableDependence->dependentOpView.op); + Value producerView = fusableDependence->dependentOpView.view; + unsigned producerIdx = + producerOp.getIndexOfInputAndOutputBuffer(producerView).getValue(); + AffineMap producerMap = producerOp.getIndexingMap(producerIdx); + if (!producerMap.isProjectedPermutation()) { + op.emitError("unhandled non permutation indexing map for fused view in " + "producer for operand at index ") + << operand.index(); + return llvm::None; + } + Value consumerView = fusableDependence->indexingView; + unsigned consumerIdx = + op.getIndexOfInputAndOutputBuffer(consumerView).getValue(); + if (!op.getIndexingMap(consumerIdx).isProjectedPermutation()) { + op.emitError( + "unhandled case where indexing map for fused view in the consumer is " + "not a projected permutation while fusing at index ") + << operand.index(); + return llvm::None; + } + fusableDependences.push_back(*fusableDependence); + if (!fusionOptions.indicesToFuse) + break; + } + return fusableDependences; +} -
// Fuse `producer` just before `consumer`. - OpBuilder::InsertionGuard g(b); - b.setInsertionPoint(consumer.getOperation()); - ScopedContext scope(b, consumer.getLoc()); - LLVM_DEBUG(dbgs() << "Fuse into consumer: " << *consumer << "\n"); - auto fusedProducer = fuse(producedView, producer, consumer, consumerIdx, - producerIdx, folder); +static bool isZero(Value v) { + if (auto cst = v.getDefiningOp()) + return cst.getValue() == 0; + return false; +} - return FusionInfo{producer, fusedProducer}; +template +static Optional +tileAndFuseLinalgOpsImpl(PatternRewriter &rewriter, LinalgOp op, + const LinalgDependenceGraph &dependenceGraph, + const LinalgTilingOptions &tilingOptions, + const LinalgFusionOptions &fusionOptions) { + assert(op.hasBufferSemantics() && "expected linalg op with buffer semantics"); + // Some of the tiling options might not be supportable with tile and fuse. + // TODO: Support interchange with tile + fuse. + if (!tilingOptions.interchangeVector.empty()) { + op.emitError("unable to handle tile and fuse with interchange"); + return llvm::None; + } + + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(op); + ScopedContext scope(rewriter, op.getLoc()); + + // Find all the producers. + Optional> + fusableDependencesOpt = + findAllFusableDependences(op, dependenceGraph, fusionOptions); + if (!fusableDependencesOpt) + return llvm::None; + ArrayRef fusableDependences( + *fusableDependencesOpt); + + // Enforce the convention that "tiling by zero" skips tiling a particular + // dimension. This convention is significantly simpler to handle instead of + // adjusting affine maps to account for missing dimensions. 
+  auto nLoops = op.getNumLoops(); + SmallVector tileSizeVector = + tilingOptions.tileSizeComputationFunction(rewriter, op); + if (tileSizeVector.size() < nLoops) { + auto zero = std_constant_index(0); + tileSizeVector.append(nLoops - tileSizeVector.size(), zero); + } + + TiledAndFusedLinalgOps ret; + + // Find the loops that can be tiled and fused. + DenseSet tileFuseLoops = + collectTileAndFuseLoops(op, fusableDependences); + + // If there are no fusable dependences or there are no tile+fusable loops, + // just return. + if (fusableDependences.empty() || tileFuseLoops.empty()) { + return llvm::None; + } + + // Get the tile sizes for the first and second tiling steps. For the first + // step the tile sizes are set to zero for the loops that aren't + // fused. Similarly for the second step, the tile sizes are set to zero for + // the loops that are fused. For example, if for the following input + // + // ``` + // linalg.add ins(%a, %b) outs(%c) + // linalg.matmul ins(%d, %c) outs(%e) + // ``` + // + // if the tile sizes of the `{i, j, k}` loops were given as `{ti, tj, tk}` + // respectively, and since only `j` can be tiled and fused. The tile sizes + // would be `{0, t_j, 0}` for the first tiling that tiles just the fusable + // loops. The second tiling would use tile sizes of `{t_i, 0, t_k}` to tile + // the tiled matmul generated by the first tiling step. + SmallVector tileAndFuseSizes, tileSizes; + for (auto tileSize : enumerate(tileSizeVector)) { + auto zero = std_constant_index(0); + if (tileFuseLoops.count(tileSize.index())) { + tileAndFuseSizes.push_back(tileSize.value()); + tileSizes.push_back(zero); + } else { + tileSizes.push_back(tileSize.value()); + tileAndFuseSizes.push_back(zero); + } } - return llvm::None; + + // Tile for the loops that can be fused.
+ LinalgTilingOptions firstTilingOptions = tilingOptions; + firstTilingOptions.setTileSizes(tileAndFuseSizes); + Optional firstTiledOp = + tileLinalgOp(rewriter, op, firstTilingOptions); + if (!firstTiledOp) + return llvm::None; + ret.op = firstTiledOp->op; + ret.fusedLoops.assign(firstTiledOp->loops.begin(), firstTiledOp->loops.end()); + + rewriter.setInsertionPoint(ret.op); + // Fuse the operands. + for (auto producer : enumerate(fusableDependences)) { + LinalgOp producerOp = cast(producer.value().dependentOpView.op); + unsigned producerIdx = producerOp + .getIndexOfInputAndOutputBuffer( + producer.value().dependentOpView.view) + .getValue(); + unsigned consumerIdx = + op.getIndexOfInputAndOutputBuffer(producer.value().indexingView) + .getValue(); + LinalgOp fusedOp = + fuse(rewriter, producerOp, producerIdx, ret.op, consumerIdx); + ret.fusedProducers.push_back(fusedOp); + ret.originalProducers.push_back(producerOp); + } + + if (!llvm::all_of(tileSizes, isZero)) { + // Tile the remaining loops of the root operation. + LinalgTilingOptions secondTilingOptions = tilingOptions; + // The distribution is done only for the tile+fused loops. + secondTilingOptions.distribution = llvm::None; + secondTilingOptions.setTileSizes(tileSizes); + Optional secondTiledOp = + tileLinalgOp(rewriter, ret.op, secondTilingOptions); + if (!secondTiledOp) + return llvm::None; + ret.unfusedLoops.assign(secondTiledOp->loops.begin(), + secondTiledOp->loops.end()); + rewriter.eraseOp(ret.op); + ret.op = secondTiledOp->op; + } + + return ret; } -// Only consider RAW and WAW atm. 
-Optional mlir::linalg::fuseProducerOf( - OpBuilder &b, LinalgOp consumer, unsigned consumerIdx, - const LinalgDependenceGraph &graph, OperationFolder *folder) { - for (auto dep : { - LinalgDependenceGraph::DependenceType::RAW, - LinalgDependenceGraph::DependenceType::WAW, - }) { - if (auto res = - fuseProducerOfDep(b, consumer, consumerIdx, graph, folder, dep)) - return res; +Optional +mlir::linalg::tileAndFuseLinalgOps(PatternRewriter &rewriter, LinalgOp op, + const LinalgDependenceGraph &dependenceGraph, + const LinalgTilingOptions &tilingOptions, + const LinalgFusionOptions &fusionOptions) { + switch (tilingOptions.loopType) { + case LinalgTilingLoopType::Loops: + return tileAndFuseLinalgOpsImpl(rewriter, op, dependenceGraph, + tilingOptions, fusionOptions); + case LinalgTilingLoopType::ParallelLoops: + return tileAndFuseLinalgOpsImpl( + rewriter, op, dependenceGraph, tilingOptions, fusionOptions); + default:; } return llvm::None; } @@ -429,687 +736,12 @@ LLVM_DEBUG(f.print(dbgs() << "\nAfter linalg-fusion: \n")); } -//====---------------------------------------------------------------------===// -// Fusion on Tensor operation. -//====---------------------------------------------------------------------===// - -namespace { - -/// Implementation of fusion of generic ops and indexed_generic ops. -struct FuseGenericOpsOnTensors { - static bool isFusible(LinalgOp producer, LinalgOp consumer, - unsigned consumerIdx) { - // Producer and consumer must have tensor semantics. - if (!producer.hasTensorSemantics() || !consumer.hasTensorSemantics()) - return false; - - // Verify that - // - the producer has all "parallel" iterator type. - if (producer.getNumParallelLoops() != producer.getNumLoops()) - return false; - - // Get the consumer index map. The number of results of the consumer index - // map must match the number of loops of the producer. 
- AffineMap consumerIndexMap = consumer.getIndexingMap(consumerIdx); - if (consumerIndexMap.getNumResults() != producer.getNumLoops()) - return false; - - // Finally the index_map for the result must be invertible. For now just - // verify it is a permutation. - AffineMap producerResultIndexMap = producer.getOutputIndexingMap(0); - return producerResultIndexMap.isPermutation(); - } - - static LinalgOp fuse(LinalgOp producer, LinalgOp consumer, - unsigned consumerIdx, PatternRewriter &rewriter, - OperationFolder *folder = nullptr) { - if (!isFusible(producer, consumer, consumerIdx)) - return nullptr; - - unsigned numFusedOperands = producer.getOperation()->getNumOperands() + - consumer.getOperation()->getNumOperands() - 1; - - // Compute the fused operands list, - SmallVector fusedOperands; - fusedOperands.reserve(numFusedOperands); - auto consumerOperands = consumer.getOperation()->getOperands(); - auto producerOperands = producer.getOperation()->getOperands(); - fusedOperands.assign(consumerOperands.begin(), - std::next(consumerOperands.begin(), consumerIdx)); - fusedOperands.append(producerOperands.begin(), producerOperands.end()); - fusedOperands.append(std::next(consumerOperands.begin(), consumerIdx + 1), - consumerOperands.end()); - - // Compute indexing_maps for the fused operation. The indexing_maps for the - // operands of the consumers that arent fused are the same. The - // indexing_maps for the producers need to be computed based on the - // indexing_map of the operand at consumerIdx in the consumer. - SmallVector fusedIndexMaps; - auto consumerIndexMaps = consumer.indexing_maps(); - fusedIndexMaps.reserve(fusedOperands.size() + - consumer.getOperation()->getNumResults()); - fusedIndexMaps.assign(consumerIndexMaps.begin(), - std::next(consumerIndexMaps.begin(), consumerIdx)); - // Compute indexing maps for the producer args in the fused operation. 
- computeProducerOperandIndex( - producer, consumer.getInputIndexingMap(consumerIdx), fusedIndexMaps); - - // Append the indexing maps for the remaining consumer operands. - fusedIndexMaps.append(std::next(consumerIndexMaps.begin(), consumerIdx + 1), - consumerIndexMaps.end()); - - // Generate the fused op. - // Tensor-level fusion is only on ops without initTensors and outputBuffers. - LinalgOp fusedOp; - if (isa(producer.getOperation()) && - isa(consumer.getOperation())) { - fusedOp = - rewriter - .create(consumer.getLoc(), - consumer.getOperation()->getResultTypes(), - /*inputs=*/fusedOperands, - /*outputBuffers=*/ValueRange{}, - /*initTensors=*/ValueRange{}, - rewriter.getArrayAttr(fusedIndexMaps), - consumer.iterator_types(), - /*doc=*/nullptr, - /*library_call=*/nullptr, - /*symbol_source=*/nullptr) - .getOperation(); - } else { - fusedOp = - rewriter - .create( - consumer.getLoc(), consumer.getOperation()->getResultTypes(), - /*inputs=*/fusedOperands, - /*outputBuffers=*/ValueRange{}, - /*initTensors=*/ValueRange{}, - rewriter.getArrayAttr(fusedIndexMaps), - consumer.iterator_types(), - /*doc=*/nullptr, - /*library_call=*/nullptr, - /*symbol_source=*/nullptr) - .getOperation(); - } - - // Construct an AffineMap from consumer loops to producer loops. 
- // consumer loop -> tensor index - AffineMap consumerResultIndexMap = - consumer.getInputIndexingMap(consumerIdx); - // producer loop -> tensor index - AffineMap producerResultIndexMap = producer.getOutputIndexingMap(0); - // tensor index -> producer loop - AffineMap invProducerResultIndexMap = - inversePermutation(producerResultIndexMap); - assert(invProducerResultIndexMap && - "expected producer result indexig map to be invertible"); - // consumer loop -> producer loop - AffineMap consumerToProducerLoopsMap = - invProducerResultIndexMap.compose(consumerResultIndexMap); - - generateFusedRegion(rewriter, fusedOp, producer, consumer, - consumerToProducerLoopsMap, consumerIdx, - consumer.getNumLoops()); - return fusedOp; - } - -private: - /// Append to `fusedOpIndexingMapAttrs` the indexing maps for the operands of - /// the `producer` to use in the fused operation given the indexing map of the - /// result of the producer in the consumer. - static void computeProducerOperandIndex( - LinalgOp producer, AffineMap fusedConsumerArgIndexMap, - SmallVectorImpl &fusedOpIndexingMapAttrs) { - // The indexing map in the consumer op (fusedConsumerArgIndexMap) is a map - // from consumer loop -> consumer arg tensor index/producer result tensor - // index. The fused loop is same as the consumer loop. For each producer arg - // the indexing map to be computed is a map from consumer loop -> producer - // arg tensor index. - - AffineMap producerResultIndexMap = producer.getOutputIndexingMap(0); - // producerResultIndexMap is a map from producer loop -> tensor index. - // Compute the inverse to get map from tensor index -> producer loop. - // The inverse is a map from producer result tensor index -> producer loop. 
- AffineMap invProducerResultIndexMap = - inversePermutation(producerResultIndexMap); - assert(invProducerResultIndexMap && - "expected producer result indexig map to be invertible"); - for (unsigned argNum : llvm::seq(0, producer.getNumInputs())) { - // argMap is a map from producer loop -> producer arg tensor index. - AffineMap argMap = producer.getInputIndexingMap(argNum); - - // Compose argMap with invProducerResultIndexMap to get a map from - // producer result tensor index -> producer arg tensor index. - AffineMap t1 = argMap.compose(invProducerResultIndexMap); - - // Compose t1 with fusedConsumerArgIndexMap gives an indexing map from - // consumer loop/ fused loop -> producer arg tensor index. - AffineMap indexingMap = t1.compose(fusedConsumerArgIndexMap); - fusedOpIndexingMapAttrs.push_back(AffineMapAttr::get(indexingMap)); - } - } - - /// Generate the region of the fused operation. The region of the fused op - /// must be empty. - static void generateFusedRegion(PatternRewriter &rewriter, Operation *fusedOp, - LinalgOp producer, LinalgOp consumer, - AffineMap consumerToProducerLoopsMap, - unsigned consumerIdx, unsigned nloops) { - // Build the region of the fused op. - Block &producerBlock = producer.getOperation()->getRegion(0).front(); - Block &consumerBlock = consumer.getOperation()->getRegion(0).front(); - Block *fusedBlock = new Block(); - fusedOp->getRegion(0).push_back(fusedBlock); - BlockAndValueMapping mapper; - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(fusedBlock); - - // The block arguments are - // [index_0, index_1, ... , - // consumer_operand_0, ... , consumer_operand_(`consumerIdx`-1), - // producer_operand_0, ... , producer_operand_(n-1)], - // consumer_operand_(`consumerIdx`), .. consumer_operand_(m-1)] - // , where n is the number of producer's operand and m is the number - // consumer's operand. - // If both `numProducerIndices` and `numConsumerIndices` are zero, this is a - // generic op. 
In this case, there are no indices in block arguments. - unsigned numProducerIndices = - isa(producer.getOperation()) ? nloops : 0; - unsigned numConsumerIndices = - isa(consumer.getOperation()) ? nloops : 0; - // Firstly, add all the indices to the block arguments. - for (unsigned i = 0, e = std::max(numProducerIndices, numConsumerIndices); - i < e; ++i) - fusedBlock->addArgument(rewriter.getIndexType()); - // Map the arguments for the unmodified args from the consumer. - for (auto consumerArg : llvm::enumerate(consumerBlock.getArguments())) { - if (consumerArg.index() == consumerIdx + numConsumerIndices) { - // Map the arguments for the args from the producer. - for (auto producerArg : llvm::enumerate(producerBlock.getArguments())) { - // If producer is an indexed_generic op, map the indices from consumer - // loop to producer loop (because the fusedOp is built based on - // consumer's perspective). - if (producerArg.index() < numProducerIndices) { - auto newIndex = rewriter.create( - producer.getLoc(), - consumerToProducerLoopsMap.getSubMap(producerArg.index()), - fusedBlock->getArguments().take_front(nloops)); - mapper.map(producerArg.value(), newIndex); - } else { - mapper.map(producerArg.value(), - fusedBlock->addArgument(producerArg.value().getType())); - } - } - continue; - } - - // If consumer is an indexed_generic op, map the indices to the block - // arguments directly. Otherwise, add the same type of arugment and map to - // it. - if (consumerArg.index() < numConsumerIndices) { - mapper.map(consumerArg.value(), - fusedBlock->getArgument(consumerArg.index())); - } else { - mapper.map(consumerArg.value(), - fusedBlock->addArgument(consumerArg.value().getType())); - } - } - - // Add operations from producer (except the yield operation) to the fused - // op. - for (auto &op : producerBlock.getOperations()) { - if (auto yieldOp = dyn_cast(op)) { - // Lookup the value the yield operation is mapped to. 
- Value yieldVal = yieldOp.getOperand(0); - if (Value clonedVal = mapper.lookupOrNull(yieldVal)) - mapper.map( - consumerBlock.getArgument(consumerIdx + numConsumerIndices), - clonedVal); - continue; - } - rewriter.clone(op, mapper); - } - for (auto &op : consumerBlock.getOperations()) - rewriter.clone(op, mapper); - } -}; -} // namespace - -/// Linearize the expressions in `sourceMap` based on the `reassociationMaps` -/// provided, given the shape of the source tensor that corresponds to the -/// `sourceMap`. Note that this implicitly assumes that the tensors dimensions -/// are "row-major" ordered logically. -/// -/// For example: -/// -/// %0 = op ... : tensor -/// with output index_map `affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>` -/// -/// and reshape: -/// %1 = linalg.tensor_reshape %0 [affine_map<(i, j, k, l) -> (i)>, -/// affine_map<(i, j, k, l) -> (j, k, l)>] : -/// tensor into tensor -/// -/// would be rewritten into: -/// %0 = op ... : tensor -/// with output index_map -/// `affine_map<(d0, d1, d2, d3) -> (d0, d1 * 20 + d2 * 5 + d3)>` -static AffineMap linearizeCollapsedDims(AffineMap sourceMap, - ArrayRef sourceShape, - ArrayRef reassociationMaps) { - SmallVector resultExprs; - resultExprs.reserve(reassociationMaps.size()); - ArrayRef sourceExprs = sourceMap.getResults(); - MLIRContext *context = sourceMap.getContext(); - - // Compute the result exprs based on the reassociation maps. - for (AffineMap map : reassociationMaps) { - ArrayRef collapsedDims = map.getResults(); - // Assume that they are in-order and contiguous (already checked in - // verifier). 
- assert(!collapsedDims.empty()); - unsigned startDim = - collapsedDims.front().cast().getPosition(); - AffineExpr linearizedExpr = makeCanonicalStridedLayoutExpr( - sourceShape.slice(startDim, collapsedDims.size()), - sourceExprs.slice(startDim, collapsedDims.size()), context); - resultExprs.push_back(linearizedExpr); - } - return AffineMap::get(sourceMap.getNumDims(), sourceMap.getNumSymbols(), - resultExprs, context); -} - -/// Checks if the `reshapeOp` can be fused with it consumer (if `asProducer` is -/// true) or its producer (if `asProducer` is false) given the indexing map at -/// its use. -static bool isTensorReshapeOpFusible(TensorReshapeOp reshapeOp, - AffineMap useIndexMap, bool asProducer) { - RankedTensorType returnType = reshapeOp.getResultType(); - RankedTensorType operandType = reshapeOp.getSrcType(); - // Reshape is fusible with its consumer (i.e. reshape as a producer) when its - // operand is of lesser rank than the result. Fusing when operand has higher - // rank will require use of mods and divs in the indexing maps of the fused op - // which would make it non-invertible. Similarly reshape is fused with its - // producer (i.e. reshape as consumer) only if the return type has lesser - // rank. - if ((asProducer && returnType.getRank() < operandType.getRank()) || - (!asProducer && operandType.getRank() < returnType.getRank())) - return false; - return useIndexMap.isIdentity(); -} - -/// Based on the type of `op` create a linalg op of the same type, i.e. if `op` -/// is a linalg.generic operation, the create a `linalg.generic` operation with -/// the given `args`. Expects `op` to be `linalg.generic` or -/// `linalg.indexed_generic`. -template -static LinalgOp createLinalgOpOfSameType(LinalgOp op, PatternRewriter &rewriter, - Args... 
args) { - if (isa(op.getOperation())) - return cast(rewriter.create(args...).getOperation()); - if (isa(op.getOperation())) - return cast( - rewriter.create(args...).getOperation()); - llvm_unreachable( - "expected only linalg.generic or linalg.indexed_generic ops"); - return nullptr; -} - namespace { - -/// Implementation of fusion on tensor ops when producer is a TensorReshapeOp. -struct FuseTensorReshapeOpAsProducer { - static bool isFusible(TensorReshapeOp producer, LinalgOp consumer, - unsigned consumerIdx) { - return isa(consumer.getOperation()) && - consumer.hasTensorSemantics() && - isTensorReshapeOpFusible(producer, - consumer.getInputIndexingMap(consumerIdx), - /*asProducer=*/true); - } - - static LinalgOp fuse(TensorReshapeOp producer, LinalgOp consumer, - unsigned consumerIdx, PatternRewriter &rewriter, - OperationFolder *folder = nullptr) { - if (producer.src().getDefiningOp()) - return nullptr; - - if (!isFusible(producer, consumer, consumerIdx)) - return nullptr; - - // Compute the fused operands list, - Operation *consumerOp = consumer.getOperation(); - SmallVector fusedOperands(consumerOp->getOperands()); - fusedOperands[consumerIdx] = producer.src(); - - // Compute indexing_maps for the fused operation. The indexing_maps for the - // operands of the consumers that arent fused are the same. - SmallVector fusedIndexMaps = - llvm::to_vector<4>(llvm::map_range( - consumer.indexing_maps(), [](Attribute attr) -> AffineMap { - return attr.cast().getValue(); - })); - - // Compute the indexing map to use for the operand of the producer. - AffineMap modifiedMap = linearizeCollapsedDims( - fusedIndexMaps[consumerIdx], producer.getResultType().getShape(), - producer.getReassociationMaps()); - for (AffineExpr expr : modifiedMap.getResults()) { - if (!expr.isPureAffine()) - return nullptr; - } - fusedIndexMaps[consumerIdx] = modifiedMap; - - // Further check that the resulting index maps can be fused and - // inverted. 
Without this the resultant op is not legal. - if (!inversePermutation(concatAffineMaps(fusedIndexMaps))) - return nullptr; - - SmallVector indexMapAttrs = llvm::to_vector<4>( - llvm::map_range(fusedIndexMaps, [](AffineMap map) -> Attribute { - return AffineMapAttr::get(map); - })); - LinalgOp fusedOp = createLinalgOpOfSameType( - consumer, rewriter, rewriter.getUnknownLoc(), - consumerOp->getResultTypes(), - /*inputs=*/fusedOperands, - /*outputBuffers=*/ValueRange{}, - /*initTensors=*/ValueRange{}, // no init tensors for now. - rewriter.getArrayAttr(indexMapAttrs), consumer.iterator_types(), - /*doc=*/nullptr, - /*library_call=*/nullptr, - /*symbol_source=*/nullptr); - auto &fusedRegion = fusedOp.getOperation()->getRegion(0); - rewriter.cloneRegionBefore(consumerOp->getRegion(0), fusedRegion, - fusedRegion.begin()); - return fusedOp; - } -}; - -/// Implementation of fusion on tensor ops when consumer is a TensorReshapeOp. -struct FuseTensorReshapeOpAsConsumer { - static bool isCollapsingAndFusible(LinalgOp producer, - TensorReshapeOp consumer, - unsigned consumerIdx) { - return isa(producer.getOperation()) && - producer.hasTensorSemantics() && - isTensorReshapeOpFusible(consumer, producer.getOutputIndexingMap(0), - /*asProducer=*/false); - } - - static LinalgOp fuseCollapsingCase(LinalgOp producer, - TensorReshapeOp consumer, - unsigned consumerIdx, - PatternRewriter &rewriter) { - // The indexing_maps for the operands of the fused operation are same as - // those for the operands of the producer. - SmallVector fusedIndexMaps = - llvm::to_vector<4>(llvm::map_range( - producer.indexing_maps(), [](Attribute attr) -> AffineMap { - return attr.cast().getValue(); - })); - // Compute the indexing map to use for the operand of the producer. 
- AffineMap modifiedMap = linearizeCollapsedDims( - producer.getOutputIndexingMap(0), consumer.getSrcType().getShape(), - consumer.getReassociationMaps()); - for (AffineExpr expr : modifiedMap.getResults()) { - if (!expr.isPureAffine()) - return nullptr; - } - fusedIndexMaps.back() = modifiedMap; - - // Further check that the resulting index maps can be fused and - // inverted. Without this the resultant op is not legal. - if (!inversePermutation(concatAffineMaps(fusedIndexMaps))) - return nullptr; - - SmallVector indexMapAttrs = llvm::to_vector<4>( - llvm::map_range(fusedIndexMaps, [](AffineMap map) -> Attribute { - return AffineMapAttr::get(map); - })); - - Operation *producerOp = producer.getOperation(); - LinalgOp fusedOp = createLinalgOpOfSameType( - producer, rewriter, rewriter.getUnknownLoc(), consumer.getResultType(), - /*inputs=*/producerOp->getOperands(), - /*outputBuffers=*/ValueRange{}, - /*initTensors=*/ValueRange{}, // no init tensors for now. - rewriter.getArrayAttr(indexMapAttrs), producer.iterator_types(), - /*doc=*/nullptr, - /*library_call=*/nullptr, - /*symbol_source=*/nullptr); - auto &fusedRegion = fusedOp.getOperation()->getRegion(0); - rewriter.cloneRegionBefore(producerOp->getRegion(0), fusedRegion, - fusedRegion.begin()); - return fusedOp; - } - - static bool isExpandingAndFusible(LinalgOp producer, TensorReshapeOp consumer, - unsigned consumerIdx) { - // Is fusible only if: - // 1) The producer is a generic op. - // 2) The producer has tensor semantics. - // 3) The tensor reshape op is a expanding case. - // 4) All the shapes are the same for the generic op. - // 5) All the indexing maps in producer are identity. - // 6) All the loops in producer are parallel loops. - // 7) The producer has a single user. 
- auto types = producer.getInputOutputShapedTypes(); - assert(!types.empty()); - return isa(producer.getOperation()) && - producer.hasTensorSemantics() && - consumer.getSrcType().getRank() < - consumer.getResultType().getRank() && - std::equal(types.begin() + 1, types.end(), types.begin()) && - llvm::all_of(producer.getIndexingMaps(), - [](AffineMap map) { return map.isIdentity(); }) && - llvm::all_of(producer.iterator_types(), - [](Attribute attr) { - return attr.cast().getValue() == - getParallelIteratorTypeName(); - }) && - producer.getOperation()->hasOneUse(); - } - - static LinalgOp fuseExpandingCase(LinalgOp producer, TensorReshapeOp consumer, - unsigned consumerIdx, - PatternRewriter &rewriter) { - Location loc = producer.getLoc(); - auto dstShape = consumer.getResultType().cast().getShape(); - SmallVector args; - for (auto arg : producer.getOperation()->getOperands()) { - auto type = RankedTensorType::get( - dstShape, arg.getType().cast().getElementType()); - args.push_back(rewriter.createOrFold( - loc, type, arg, consumer.reassociation())); - } - - SmallVector resultTypes; - for (auto t : producer.getOutputTensorTypes()) { - Type type = RankedTensorType::get(dstShape, - t.cast().getElementType()); - resultTypes.push_back(type); - } - - int rank = dstShape.size(); - auto genericOp = rewriter.create( - loc, resultTypes, /*inputs=*/args, - /*outputBuffers=*/ValueRange{}, - /*initTensors=*/ValueRange{}, - SmallVector(args.size() + resultTypes.size(), - rewriter.getMultiDimIdentityMap(rank)), - SmallVector(rank, getParallelIteratorTypeName())); - Region ®ion = genericOp.getRegion(); - rewriter.cloneRegionBefore(producer.getOperation()->getRegion(0), region, - region.begin()); - return cast(genericOp.getOperation()); - } - - static LinalgOp fuse(LinalgOp producer, TensorReshapeOp consumer, - unsigned consumerIdx, PatternRewriter &rewriter, - OperationFolder *folder = nullptr) { - if (isCollapsingAndFusible(producer, consumer, consumerIdx)) - return 
fuseCollapsingCase(producer, consumer, consumerIdx, rewriter); - if (isExpandingAndFusible(producer, consumer, consumerIdx)) - return fuseExpandingCase(producer, consumer, consumerIdx, rewriter); - return nullptr; - } -}; - -/// Implementation of fusion on tensor ops when producer is a splat constant. -struct FuseConstantOpAsProducer { - static bool isFusible(ConstantOp producer, LinalgOp consumer, - unsigned consumerIdx) { - return isa(consumer.getOperation()) && - consumer.hasTensorSemantics() && - producer.getResult().getType().isa() && - producer.value().cast().isSplat(); - } - - static LinalgOp fuse(ConstantOp producer, LinalgOp consumer, - unsigned consumerIdx, PatternRewriter &rewriter, - OperationFolder *folder = nullptr) { - if (!isFusible(producer, consumer, consumerIdx)) - return nullptr; - - // The indexing_maps for the operands of the fused operation are same as - // those for the operands of the consumer without the indexing map at - // consumerIdx - SmallVector fusedIndexMaps = - llvm::to_vector<4>(llvm::map_range( - consumer.indexing_maps(), [](Attribute attr) -> AffineMap { - return attr.cast().getValue(); - })); - fusedIndexMaps.erase(std::next(fusedIndexMaps.begin(), consumerIdx)); - - // The operands list is same as the consumer with the argument for constant - // index dropped. - Operation *consumerOp = consumer.getOperation(); - SmallVector fusedOperands(consumerOp->getOperands()); - fusedOperands.erase(std::next(fusedOperands.begin(), consumerIdx)); - - // Create a constant scalar value from the splat constant. - Value scalarConstant = rewriter.create( - producer.getLoc(), - producer.value().cast().getSplatValue()); - - LinalgOp fusedOp = createLinalgOpOfSameType( - consumer, rewriter, rewriter.getUnknownLoc(), - consumerOp->getResultTypes(), - /*inputs=*/fusedOperands, - /*outputBuffers=*/ValueRange{}, - /*initTensors=*/ValueRange{}, // no init tensors for now. 
- rewriter.getAffineMapArrayAttr(fusedIndexMaps), - consumer.iterator_types(), - /*doc=*/nullptr, - /*library_call=*/nullptr, - /*symbol_source=*/nullptr); - - // Map the block argument corresponding to the replaced argument with the - // scalar constant. - Region &consumerRegion = consumerOp->getRegion(0); - Block &entryBlock = *consumerRegion.begin(); - unsigned argIndex = entryBlock.getNumArguments() - - consumerOp->getNumOperands() + consumerIdx; - BlockAndValueMapping mapping; - mapping.map(entryBlock.getArgument(argIndex), scalarConstant); - Region &fusedRegion = fusedOp.getOperation()->getRegion(0); - rewriter.cloneRegionBefore(consumerRegion, fusedRegion, fusedRegion.begin(), - mapping); - return fusedOp; - } -}; -} // namespace - -Operation *mlir::linalg::fuseTensorOps(PatternRewriter &rewriter, - Operation *consumer, - unsigned consumerIdx, - OperationFolder *folder) { - if (consumerIdx >= consumer->getNumOperands()) - return nullptr; - Operation *producer = consumer->getOperand(consumerIdx).getDefiningOp(); - if (!producer || producer->getNumResults() != 1) - return nullptr; - - // Fuse when consumer is GenericOp or IndexedGenericOp. - if (isa(consumer)) { - if (isa(producer)) - return FuseGenericOpsOnTensors::fuse(cast(producer), - cast(consumer), - consumerIdx, rewriter, folder); - if (auto reshapeOpProducer = dyn_cast(producer)) - return FuseTensorReshapeOpAsProducer::fuse(reshapeOpProducer, - cast(consumer), - consumerIdx, rewriter, folder); - if (auto constantOpProducer = dyn_cast(producer)) - return FuseConstantOpAsProducer::fuse(constantOpProducer, - cast(consumer), - consumerIdx, rewriter, folder); - return nullptr; - } - - if (isa(producer)) { - // Fuse when consumer is a TensorReshapeOp. 
- if (TensorReshapeOp reshapeOp = dyn_cast(consumer)) { - return FuseTensorReshapeOpAsConsumer::fuse( - cast(producer), reshapeOp, consumerIdx, rewriter, folder); - } - } - - return nullptr; -} - -namespace { -/// Patterns to fuse a generic op, with the producer of its operands. -template -struct FuseTensorOps : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(LinalgOpTy op, - PatternRewriter &rewriter) const override { - // Find the first operand that is defined by another generic op on tensors. - for (auto operandNum : - llvm::seq(0, op.getOperation()->getNumOperands())) { - Operation *producer = - op.getOperation()->getOperand(operandNum).getDefiningOp(); - if (Operation *fusedOp = fuseTensorOps(rewriter, op, operandNum)) { - rewriter.replaceOp(op, fusedOp->getResults()); - if (producer && llvm::all_of(producer->getResults(), - [](Value val) { return val.use_empty(); })) - rewriter.eraseOp(producer); - return success(); - } - } - return failure(); - } -}; - -/// Pass that fuses generic ops on tensors. Used only for testing. 
-struct FusionOfTensorOpsPass - : public LinalgFusionOfTensorOpsBase { - void runOnOperation() override { - OwningRewritePatternList patterns; - Operation *op = getOperation(); - populateLinalgTensorOpsFusionPatterns(op->getContext(), patterns); - applyPatternsAndFoldGreedily(op->getRegions(), patterns); - }; -}; - struct LinalgFusionPass : public LinalgFusionBase { void runOnFunction() override { fuseLinalgOpsGreedily(getFunction()); } }; } // namespace -void mlir::populateLinalgTensorOpsFusionPatterns( - MLIRContext *context, OwningRewritePatternList &patterns) { - patterns.insert, FuseTensorOps, - FuseTensorOps>(context); -} - std::unique_ptr> mlir::createLinalgFusionPass() { return std::make_unique(); } - -std::unique_ptr mlir::createLinalgFusionOfTensorOpsPass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp copy from mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp copy to mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/FusionOnTensors.cpp @@ -6,433 +6,24 @@ // //===----------------------------------------------------------------------===// // -// This file implements the linalg dialect Fusion pass. +// This file implements the linalg dialect Fusion on tensors operations pass. 
// //===----------------------------------------------------------------------===// - #include "PassDetail.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" -#include "mlir/Dialect/Linalg/EDSC/FoldedIntrinsics.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/Linalg/IR/LinalgTypes.h" #include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" -#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" -#include "mlir/IR/Dominance.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Support/LLVM.h" -#include "mlir/Transforms/FoldUtils.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" - -#define DEBUG_TYPE "linalg-fusion" using namespace mlir; -using namespace mlir::edsc; -using namespace mlir::edsc::intrinsics; using namespace mlir::linalg; -using folded_std_constant_index = FoldedValueBuilder; - -using llvm::dbgs; - -/// Implements a simple high-level fusion pass of linalg library operations. -/// -/// In each block, linalg ops are processed in reverse textual order. -/// Given a linalg op `O`, fusion occurs by: -/// 1. inspecting the linalg ops that write into the views read by `O`. This -/// uses the SSA value of the views and a simple subview/slice analysis to -/// determine producer-consumer dependences; -/// 2. greedily fuse the linalg ops that produce subview -/// 3. inspect the fused ops and determine whether they have other remaining -/// LinalgOp uses. If not, then erase the original producing linalg op. -/// -/// More advanced use cases, analyses as well as profitability heuristics are -/// left for future work. - -// Return a cloned version of `op` that operates on `loopRanges`, assumed to be -// a subset of the original loop ranges of `op`. 
-// This is achieved by applying the `loopToOperandRangesMaps` permutation maps -// to the `loopRanges` in order to obtain view ranges. -static LinalgOp cloneWithLoopRanges(OpBuilder &b, Location loc, LinalgOp op, - ArrayRef loopRanges) { - assert(op.hasBufferSemantics() && "expected linalg op with buffer semantics"); - auto maps = op.indexing_maps(); - SmallVector clonedViews; - clonedViews.reserve(op.getNumInputsAndOutputs()); - // Iterate over the inputs and outputs in order. - // Extract the subranges from the linearized ranges. - SmallVector ios(op.getInputsAndOutputBuffers()); - for (auto en : llvm::enumerate(ios)) { - unsigned idx = en.index(); - auto map = maps[idx].cast().getValue(); - LLVM_DEBUG(dbgs() << "map: " << map << "\n"); - Value view = en.value(); - SmallVector viewRanges(map.getNumResults()); - for (auto en2 : llvm::enumerate(map.getResults())) { - unsigned d = en2.index(); - // loopToOperandRangesMaps are permutations-only. - unsigned loopPos = en2.value().cast().getPosition(); - viewRanges[d] = loopRanges[loopPos]; - LLVM_DEBUG(dbgs() << "\ni,j: " << en.index() << ", " << en2.index() - << "\t" - << "loopPos: " << loopPos << "\t" << viewRanges[d]); - } - // Construct a new subview for the tile. - unsigned rank = viewRanges.size(); - SmallVector offsets, sizes, strides; - offsets.reserve(rank); - sizes.reserve(rank); - strides.reserve(rank); - for (auto r : viewRanges) { - offsets.push_back(r.offset); - sizes.push_back(r.size); - strides.push_back(r.stride); - } - clonedViews.push_back( - b.create(loc, view, offsets, sizes, strides)); - } - auto operands = getAssumedNonViewOperands(op); - clonedViews.append(operands.begin(), operands.end()); - - Operation *clonedOp = op.clone(b, loc, /*resultTypes*/ {}, clonedViews); - // When the producer is an IndexedGenercOp, we have to transform its block - // IV arguments according to the tiling of the consumer, i.e. offset them by - // the values computed in `loopRanges`. 
- if (auto indexedGenericOp = dyn_cast(clonedOp)) { - auto &block = indexedGenericOp.region().front(); - - OpBuilder::InsertionGuard g(b); - b.setInsertionPointToStart(&block); - for (unsigned i = 0, e = indexedGenericOp.getNumLoops(); i < e; ++i) { - Value oldIndex = block.getArgument(i); - AddIOp newIndex = b.create(indexedGenericOp.getLoc(), oldIndex, - loopRanges[i].offset); - oldIndex.replaceAllUsesExcept(newIndex, - SmallPtrSet{newIndex}); - } - } - return clonedOp; -} - -struct ViewDimension { - Value view; - unsigned dimension; -}; - -// Given an `op`, returns the first (`view`, `dimension`) pair that identifies -// the loop range at `loopDepth`. The semantics of the loopToOperandRangesMaps -// guarantees at least one such dimension is found. If multiple candidates exist -// they must agree by construction (i.e. have the same size) and we just return -// the first one. -static ViewDimension getViewDefiningLoopRange(LinalgOp op, unsigned loopDepth) { - assert(op.hasBufferSemantics() && "expected linalg op with buffer semantics"); - auto maps = op.indexing_maps(); - // Iterate over the inputs and outputs in order. - // Extract the subranges from the linearized ranges. 
- SmallVector ios(op.getInputsAndOutputBuffers()); - for (auto en : llvm::enumerate(ios)) { - unsigned idx = en.index(); - auto map = maps[idx].cast().getValue(); - LLVM_DEBUG(dbgs() << "getViewDefiningLoopRange I/O idx: " << idx << "\n"); - LLVM_DEBUG(dbgs() << "getViewDefiningLoopRange map: " << map << "\n"); - Value view = en.value(); - SmallVector viewRanges(map.getNumResults(), nullptr); - for (auto en2 : llvm::enumerate(map.getResults())) { - if (loopDepth == en2.value().cast().getPosition()) { - LLVM_DEBUG(dbgs() << "getViewDefiningLoopRange loopDepth: " << loopDepth - << "\n"); - LLVM_DEBUG(dbgs() << "getViewDefiningLoopRange view: " << view << "\n"); - return ViewDimension{view, static_cast(en2.index())}; - } - } - } - llvm_unreachable("Expect to be able to extract a view defining loop range"); -} - -static LinalgOp fuse(Value producedView, LinalgOp producer, LinalgOp consumer, - unsigned consumerIdx, unsigned producerIdx, - OperationFolder *folder) { - assert(producer.hasBufferSemantics() && - "expected linalg op with buffer semantics"); - assert(consumer.hasBufferSemantics() && - "expected linalg op with buffer semantics"); - - auto subView = dyn_cast_or_null( - consumer.getBuffer(consumerIdx).getDefiningOp()); - auto slice = dyn_cast_or_null( - consumer.getBuffer(consumerIdx).getDefiningOp()); - assert(subView || slice); - (void)subView; - (void)slice; - - // loopToOperandRangesMaps are permutations-only by construction: - // we can always identify a data dimension with a (at least one) loop - // dimension. 
- AffineMap producerMap = - producer.indexing_maps()[producer.getNumInputs() + producerIdx] - .cast() - .getValue(); - LLVM_DEBUG(dbgs() << "Producer Idx: " << producerIdx - << ", producer map: " << producerMap << "\n"); - - unsigned nPar = producer.getNumParallelLoops(); - unsigned nRed = producer.getNumReductionLoops(); - unsigned nWin = producer.getNumWindowLoops(); - SmallVector loopRanges(nPar + nRed + nWin); - - OpBuilder b(consumer.getOperation()); - auto loc = consumer.getLoc(); - // Iterate over dimensions identified by the producer map for `producerIdx`. - // This defines a subset of the loop ranges that we need to complete later. - for (auto en : llvm::enumerate(producerMap.getResults())) { - unsigned posInProducerLoop = en.value().cast().getPosition(); - loopRanges[posInProducerLoop] = - subView.getOrCreateRanges(b, loc)[en.index()]; - } - - // Iterate over all dimensions. For the dimensions not identified by the - // producer map for `producerIdx`, we need to explicitly compute the view that - // defines the loop ranges using the `producer`. - for (unsigned i = 0, nLoops = loopRanges.size(); i < nLoops; ++i) { - if (loopRanges[i].offset) - LLVM_DEBUG(llvm::dbgs() - << "existing LoopRange: " << loopRanges[i] << "\n"); - else { - auto viewDim = getViewDefiningLoopRange(producer, i); - loopRanges[i] = SubViewOp::Range{folded_std_constant_index(folder, 0), - std_dim(viewDim.view, viewDim.dimension), - folded_std_constant_index(folder, 1)}; - LLVM_DEBUG(llvm::dbgs() << "new LoopRange: " << loopRanges[i] << "\n"); - } - } - - return cloneWithLoopRanges(b, loc, producer, loopRanges); -} - -// Encode structural fusion safety preconditions. -// Some of these will be lifted in the future with better analysis. 
-static bool isStructurallyFusableProducer(LinalgOp producer, Value consumedView, - LinalgOp consumer) { - assert(producer.hasBufferSemantics() && - "expected linalg op with buffer semantics"); - assert(consumer.hasBufferSemantics() && - "expected linalg op with buffer semantics"); - if (producer.getNumOutputs() != 1) { - LLVM_DEBUG(dbgs() << "\nNot structurally fusable (multi-output)"); - return false; - } - // Only fuse when the producer block dominates. - DominanceInfo dom(producer.getOperation()); - if (!dom.dominates(producer.getOperation()->getBlock(), - consumer.getOperation()->getBlock())) { - LLVM_DEBUG( - dbgs() - << "\nNot structurally fusable (producer block does not dominate)"); - return false; - } - return true; -} - -bool mlir::linalg::isProducerLastWriteOfView(const LinalgDependenceGraph &graph, - LinalgOp consumer, - Value consumedView, - LinalgOp producer) { - assert(producer.hasBufferSemantics() && - "expected linalg op with buffer semantics"); - assert(consumer.hasBufferSemantics() && - "expected linalg op with buffer semantics"); - // Make some simple structural checks that alleviate the need for more - // complex analyses. - if (!isStructurallyFusableProducer(producer, consumedView, consumer)) { - LLVM_DEBUG(dbgs() << "\n***Not static last write due to structure:\t" - << *producer.getOperation()); - return false; - } - // Check for any interleaved write to consumedView. 
- if (!graph.findCoveringWrites(producer, consumer, consumedView).empty()) { - LLVM_DEBUG(dbgs() << "\n***Not fusable due to interleaved write:\t" - << *producer.getOperation()); - return false; - } - return true; -} - -bool mlir::linalg::isFusableInto(const LinalgDependenceGraph &graph, - LinalgOp consumer, Value consumedView, - LinalgOp producer) { - assert(producer.hasBufferSemantics() && - "expected linalg op with buffer semantics"); - assert(consumer.hasBufferSemantics() && - "expected linalg op with buffer semantics"); - if (!isProducerLastWriteOfView(graph, consumer, consumedView, producer)) - return false; - // Check for any fusion-preventing dependence to any view read/written that - // would violate dependences. - if (!graph.findCoveringDependences(producer, consumer).empty()) { - LLVM_DEBUG(dbgs() << "\n***Not fusable due to an interleaved dependence:\t" - << *producer.getOperation()); - return false; - } - if (auto convOp = dyn_cast(producer.getOperation())) { - // TODO: add a level of indirection to linalg.generic. - if (convOp.padding()) - return false; - } - if (auto convOp = dyn_cast(consumer.getOperation())) { - // TODO: add a level of indirection to linalg.generic. - if (convOp.padding()) - return false; - } - return true; -} - -static bool isSameSubView(Value a, Value b) { - if (a == b) - return true; - auto sva = a.getDefiningOp(); - auto svb = b.getDefiningOp(); - if (!sva || !svb) - return false; - if (!isSameSubView(sva.getViewSource(), svb.getViewSource())) - return false; - if (sva.getType() != svb.getType()) - return false; - if (sva.getRank() != svb.getRank()) - return false; - if (sva.getNumOperands() != svb.getNumOperands()) - return false; - if (sva.static_offsets() != svb.static_offsets()) - return false; - if (sva.static_sizes() != svb.static_sizes()) - return false; - if (sva.static_strides() != svb.static_strides()) - return false; - /// Skip the "viewSource" operand. 
- for (unsigned idx = 1, e = sva.getNumOperands(); idx != e; ++idx) - if (sva.getOperand(idx) != svb.getOperand(idx)) - return false; - return true; -} - -static Optional -fuseProducerOfDep(OpBuilder &b, LinalgOp consumer, unsigned consumerIdx, - const LinalgDependenceGraph &graph, OperationFolder *folder, - LinalgDependenceGraph::DependenceType depType) { - assert(consumer.hasBufferSemantics() && - "expected linalg op with buffer semantics"); - LLVM_DEBUG(dbgs() << "\nStart examining consumer: " - << *consumer.getOperation()); - for (auto dependence : graph.getDependencesInto(consumer, depType)) { - LLVM_DEBUG(dbgs() << "\n***Consider producer:\t" - << *dependence.dependentOpView.op << "\n"); - auto producer = cast(dependence.dependentOpView.op); - - // Check that the dependence is indeed on the input `consumerIdx` view. - auto consumedView = dependence.indexingView; - if (!isSameSubView(consumer.getBuffer(consumerIdx), consumedView)) - continue; - - // Consumer consumes this view, `isStructurallyFusableProducer` also checks - // whether it is a strict subview of the producer view. - auto producedView = dependence.dependentOpView.view; - auto producerIdx = producer.getIndexOfOutputBuffer(producedView).getValue(); - // `consumerIdx` and `producerIdx` exist by construction. - LLVM_DEBUG(dbgs() << "\n" - << LinalgDependenceGraph::getDependenceTypeStr(depType) - << "producer: " << *producer.getOperation() << " view: " - << producedView << " output index: " << producerIdx); - - // Must be a subview or a slice to guarantee there are loops we can fuse - // into. - auto subView = consumedView.getDefiningOp(); - auto slice = consumedView.getDefiningOp(); - if (!subView && !slice) { - LLVM_DEBUG(dbgs() << "\nNot fusable (not a subview or slice)"); - continue; - } - - // Simple fusability checks. - if (!isFusableInto(graph, consumer, consumedView, producer)) - continue; - - // Fuse `producer` just before `consumer`. 
- OpBuilder::InsertionGuard g(b); - b.setInsertionPoint(consumer.getOperation()); - ScopedContext scope(b, consumer.getLoc()); - LLVM_DEBUG(dbgs() << "Fuse into consumer: " << *consumer << "\n"); - auto fusedProducer = fuse(producedView, producer, consumer, consumerIdx, - producerIdx, folder); - - return FusionInfo{producer, fusedProducer}; - } - return llvm::None; -} - -// Only consider RAW and WAW atm. -Optional mlir::linalg::fuseProducerOf( - OpBuilder &b, LinalgOp consumer, unsigned consumerIdx, - const LinalgDependenceGraph &graph, OperationFolder *folder) { - for (auto dep : { - LinalgDependenceGraph::DependenceType::RAW, - LinalgDependenceGraph::DependenceType::WAW, - }) { - if (auto res = - fuseProducerOfDep(b, consumer, consumerIdx, graph, folder, dep)) - return res; - } - return llvm::None; -} - -static void fuseLinalgOpsGreedily(FuncOp f) { - LLVM_DEBUG(f.print(dbgs() << "\nBefore linalg-fusion: \n")); - - OpBuilder b(f); - OperationFolder folder(f.getContext()); - DenseSet eraseSet; - - // Save original Linalg ops, we only want to make a pass over those. - SmallVector linalgOps; - f.walk([&](LinalgOp op) { - if (op.hasBufferSemantics()) - linalgOps.push_back(op); - }); - - // TODO: LinalgDependenceGraph should be able to update itself. - // The current naive and expensive reconstruction of the graph should be - // removed. 
- for (auto *op : llvm::reverse(linalgOps)) { - for (unsigned id = 0, e = LinalgOp(op).getNumInputsAndOutputBuffers(); - id < e; ++id) { - linalg::Aliases aliases; - linalg::LinalgDependenceGraph graph(aliases, linalgOps); - if (auto info = fuseProducerOf(b, op, id, graph, &folder)) { - auto *originalOp = info->originalProducer.getOperation(); - eraseSet.insert(originalOp); - auto *originalOpInLinalgOpsVector = - std::find(linalgOps.begin(), linalgOps.end(), originalOp); - *originalOpInLinalgOpsVector = info->fusedProducer.getOperation(); - } - } - } - // The `fuseProducerOf` function performs structural checks and in particular - // that no covering read or write exist between the consumer and the producer. - // As a consequence, the only fusions that may occur preserve subsequent - // dependences and are guaranteed by construction to produce the whole view. - // We may thus erase the producer once it is fused. - for (auto *e : eraseSet) - e->erase(); - LLVM_DEBUG(f.print(dbgs() << "\nAfter linalg-fusion: \n")); -} - -//====---------------------------------------------------------------------===// -// Fusion on Tensor operation. -//====---------------------------------------------------------------------===// - namespace { /// Implementation of fusion of generic ops and indexed_generic ops. 
@@ -1094,10 +685,6 @@ applyPatternsAndFoldGreedily(op->getRegions(), patterns); }; }; - -struct LinalgFusionPass : public LinalgFusionBase { - void runOnFunction() override { fuseLinalgOpsGreedily(getFunction()); } -}; } // namespace void mlir::populateLinalgTensorOpsFusionPatterns( @@ -1106,10 +693,6 @@ FuseTensorOps>(context); } -std::unique_ptr> mlir::createLinalgFusionPass() { - return std::make_unique(); -} - std::unique_ptr mlir::createLinalgFusionOfTensorOpsPass() { return std::make_unique(); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -318,25 +318,10 @@ } template -Optional static tileLinalgOpImpl( - OpBuilder &b, LinalgOp op, const LinalgTilingOptions &options) { - OpBuilder::InsertionGuard g(b); - b.setInsertionPoint(op); - ScopedContext scope(b, op.getLoc()); - - assert(op.hasBufferSemantics() && "expected linalg op with buffer semantics"); - // 1. Enforce the convention that "tiling by zero" skips tiling a particular - // dimension. This convention is significantly simpler to handle instead of - // adjusting affine maps to account for missing dimensions. +static Optional +tileLinalgOpImpl(OpBuilder &b, LinalgOp op, ArrayRef tileSizes, + const LinalgTilingOptions &options) { auto nLoops = op.getNumLoops(); - SmallVector tileSizeVector = - options.tileSizeComputationFunction(b, op); - if (tileSizeVector.size() < nLoops) { - auto zero = std_constant_index(0); - tileSizeVector.append(nLoops - tileSizeVector.size(), zero); - } - - ArrayRef tileSizes = tileSizeVector; // Initial tile sizes may be too big, only take the first nLoops. tileSizes = tileSizes.take_front(nLoops); @@ -350,17 +335,7 @@ return llvm::None; } - // If interchangeVector is empty, use the identity. Build the permutation map - // otherwise. 
- auto invPermutationMap = - AffineMap::getMultiDimIdentityMap(tileSizes.size(), b.getContext()); - if (!options.interchangeVector.empty()) - invPermutationMap = inversePermutation(AffineMap::getPermutationMap( - options.interchangeVector, b.getContext())); - if (!invPermutationMap) - return llvm::None; - - // 2. Build the tiled loop ranges. + // 1. Build the tiled loop ranges. auto allViewSizes = getViewSizes(b, op); // The flattened loopToOperandRangesMaps is expected to be an invertible // permutation map (asserted in the inverse calculation). @@ -374,17 +349,39 @@ SmallVector loopRanges; LoopIndexToRangeIndexMap loopIndexToRangeIndex; std::tie(loopRanges, loopIndexToRangeIndex) = makeTiledLoopRanges( - b, scope.getLocation(), viewSizesToLoopsMap, allViewSizes, tileSizes); - if (!options.interchangeVector.empty()) - applyPermutationToVector(loopRanges, options.interchangeVector); + b, op.getLoc(), viewSizesToLoopsMap, allViewSizes, tileSizes); + SmallVector iteratorTypes; + for (auto attr : + enumerate(op.iterator_types().cast().getValue())) { + if (loopIndexToRangeIndex.count(attr.index())) + iteratorTypes.push_back(attr.value()); + } + // If interchangeVector is empty, use the identity. Build the permutation map + // otherwise. + auto invPermutationMap = + AffineMap::getMultiDimIdentityMap(tileSizes.size(), b.getContext()); + if (!options.interchangeVector.empty()) { + // Based on the pruned iterations (due to zero tile size), recompute the + // interchange vector. 
+ SmallVector interchangeVector; + interchangeVector.reserve(options.interchangeVector.size()); + for (auto pos : options.interchangeVector) { + auto it = loopIndexToRangeIndex.find(pos); + if (it == loopIndexToRangeIndex.end()) + continue; + interchangeVector.push_back(it->second); + } + invPermutationMap = inversePermutation( + AffineMap::getPermutationMap(interchangeVector, b.getContext())); + if (!invPermutationMap) + return llvm::None; + applyPermutationToVector(loopRanges, interchangeVector); + applyPermutationToVector(iteratorTypes, interchangeVector); + } - // 3. Create the tiled loops. + // 2. Create the tiled loops. LinalgOp res = op; SmallVector ivs; - SmallVector iteratorTypes = - llvm::to_vector<4>(op.iterator_types().cast().getValue()); - if (!options.interchangeVector.empty()) - applyPermutationToVector(iteratorTypes, options.interchangeVector); GenerateLoopNest::doit( loopRanges, /*iterArgInitValues*/ {}, iteratorTypes, [&](ValueRange localIvs, ValueRange iterArgs) -> scf::ValueVector { @@ -410,10 +407,10 @@ }, options.distribution); - // 4. Transforms index arguments of `linalg.generic` w.r.t. to the tiling. + // 3. Transforms index arguments of `linalg.generic` w.r.t. to the tiling. transformIndexedGenericOpIndices(b, res, ivs, loopIndexToRangeIndex); - // 5. Gather the newly created loops and return them with the new op. + // 4. Gather the newly created loops and return them with the new op. SmallVector loops; loops.reserve(ivs.size()); for (auto iv : ivs) { @@ -429,14 +426,38 @@ return TiledLinalgOp{res, loops}; } +template +Optional static tileLinalgOpImpl( + OpBuilder &b, LinalgOp op, const LinalgTilingOptions &options) { + OpBuilder::InsertionGuard g(b); + b.setInsertionPoint(op); + ScopedContext scope(b, op.getLoc()); + + assert(op.hasBufferSemantics() && "expected linalg op with buffer semantics"); + // Enforce the convention that "tiling by zero" skips tiling a particular + // dimension. 
This convention is significantly simpler to handle instead of + // adjusting affine maps to account for missing dimensions. + auto nLoops = op.getNumLoops(); + SmallVector tileSizeVector = + options.tileSizeComputationFunction(b, op); + if (tileSizeVector.size() < nLoops) { + auto zero = std_constant_index(0); + tileSizeVector.append(nLoops - tileSizeVector.size(), zero); + } + + return tileLinalgOpImpl(b, op, tileSizeVector, options); +} + Optional mlir::linalg::tileLinalgOp(OpBuilder &b, LinalgOp op, const LinalgTilingOptions &options) { - if (options.loopType == LinalgTilingLoopType::Loops) + switch (options.loopType) { + case LinalgTilingLoopType::Loops: return tileLinalgOpImpl(b, op, options); - if (options.loopType == LinalgTilingLoopType::ParallelLoops) + case LinalgTilingLoopType::ParallelLoops: return tileLinalgOpImpl(b, op, options); - // TODO: Impl tiling to affine loops when it makes sense. + default:; + } return llvm::None; } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -129,6 +129,43 @@ return success(); } +mlir::linalg::LinalgBaseTileAndFusePattern::LinalgBaseTileAndFusePattern( + StringRef opName, MLIRContext *context, + const LinalgDependenceGraph &dependenceGraph, + LinalgTilingOptions tilingOptions, LinalgFusionOptions fusionOptions, + LinalgMarker marker, LinalgMarker fusedOpMarker, + LinalgMarker originalOpMarker, PatternBenefit benefit) + : RewritePattern(opName, {}, benefit, context), + dependenceGraph(dependenceGraph), tilingOptions(tilingOptions), + fusionOptions(fusionOptions), marker(marker), + fusedOpMarker(fusedOpMarker), originalOpMarker(originalOpMarker) {} + +LogicalResult mlir::linalg::LinalgBaseTileAndFusePattern::matchAndRewrite( + Operation *op, PatternRewriter &rewriter) const { + LinalgOp linalgOp = dyn_cast(op); + if (!linalgOp) + return 
failure(); + if (failed(marker.checkAndNotify(rewriter, linalgOp))) + return failure(); + if (!linalgOp.hasBufferSemantics()) + return failure(); + + Optional tiledAndFusedOps = tileAndFuseLinalgOps( + rewriter, op, dependenceGraph, tilingOptions, fusionOptions); + if (!tiledAndFusedOps) + return failure(); + marker.replaceLinalgMarker(rewriter, tiledAndFusedOps->op.getOperation()); + for (auto fusedOp : tiledAndFusedOps->fusedProducers) { + fusedOpMarker.replaceLinalgMarker(rewriter, fusedOp.getOperation()); + } + for (auto origProducerOp : tiledAndFusedOps->originalProducers) + originalOpMarker.replaceLinalgMarker(rewriter, + origProducerOp.getOperation()); + rewriter.updateRootInPlace( + op, [&]() { originalOpMarker.replaceLinalgMarker(rewriter, op); }); + return success(); +} + /// Linalg base interchange pattern. mlir::linalg::LinalgBaseInterchangePattern::LinalgBaseInterchangePattern( StringRef opName, MLIRContext *context, diff --git a/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt --- a/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt @@ -5,13 +5,13 @@ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Linalg LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIREDSC MLIRIR MLIRLinalgEDSC - MLIRLinalgOps + MLIRLinalg MLIRSCF MLIRPass - MLIRStandardOps + MLIRStandard MLIRTransformUtils ) diff --git a/mlir/lib/Dialect/Quant/CMakeLists.txt b/mlir/lib/Dialect/Quant/CMakeLists.txt --- a/mlir/lib/Dialect/Quant/CMakeLists.txt +++ b/mlir/lib/Dialect/Quant/CMakeLists.txt @@ -21,6 +21,6 @@ MLIRPass MLIRSideEffectInterfaces MLIRSupport - MLIRStandardOps + MLIRStandard MLIRTransformUtils ) diff --git a/mlir/lib/Dialect/SCF/CMakeLists.txt b/mlir/lib/Dialect/SCF/CMakeLists.txt --- a/mlir/lib/Dialect/SCF/CMakeLists.txt +++ b/mlir/lib/Dialect/SCF/CMakeLists.txt @@ -13,7 +13,7 @@ MLIRIR MLIRLoopLikeInterface MLIRSideEffectInterfaces - MLIRStandardOps + MLIRStandard ) add_subdirectory(Transforms) diff 
--git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt --- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt @@ -11,11 +11,11 @@ MLIRSCFPassIncGen LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRIR MLIRPass MLIRSCF - MLIRStandardOps + MLIRStandard MLIRSupport MLIRTransformUtils ) diff --git a/mlir/lib/Dialect/Shape/IR/CMakeLists.txt b/mlir/lib/Dialect/Shape/IR/CMakeLists.txt --- a/mlir/lib/Dialect/Shape/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Shape/IR/CMakeLists.txt @@ -17,5 +17,5 @@ MLIRInferTypeOpInterface MLIRIR MLIRSideEffectInterfaces - MLIRStandardOps + MLIRStandard ) diff --git a/mlir/lib/Dialect/StandardOps/CMakeLists.txt b/mlir/lib/Dialect/StandardOps/CMakeLists.txt --- a/mlir/lib/Dialect/StandardOps/CMakeLists.txt +++ b/mlir/lib/Dialect/StandardOps/CMakeLists.txt @@ -1,4 +1,4 @@ -add_mlir_dialect_library(MLIRStandardOps +add_mlir_dialect_library(MLIRStandard IR/Ops.cpp EDSC/Builders.cpp EDSC/Intrinsics.cpp diff --git a/mlir/lib/Dialect/StandardOps/Transforms/CMakeLists.txt b/mlir/lib/Dialect/StandardOps/Transforms/CMakeLists.txt --- a/mlir/lib/Dialect/StandardOps/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/StandardOps/Transforms/CMakeLists.txt @@ -12,6 +12,6 @@ LINK_LIBS PUBLIC MLIRIR MLIRPass - MLIRStandardOps + MLIRStandard MLIRTransforms ) diff --git a/mlir/lib/Dialect/Vector/CMakeLists.txt b/mlir/lib/Dialect/Vector/CMakeLists.txt --- a/mlir/lib/Dialect/Vector/CMakeLists.txt +++ b/mlir/lib/Dialect/Vector/CMakeLists.txt @@ -14,9 +14,9 @@ MLIRAffineEDSC MLIREDSC MLIRIR - MLIRStandardOps - MLIRAffineOps - MLIRLinalgOps + MLIRStandard + MLIRAffine + MLIRLinalg MLIRSCF MLIRLoopAnalysis MLIRSideEffectInterfaces diff --git a/mlir/lib/Dialect/Vector/VectorOps.cpp b/mlir/lib/Dialect/Vector/VectorOps.cpp --- a/mlir/lib/Dialect/Vector/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/VectorOps.cpp @@ -900,6 +900,29 @@ populateFromInt64AttrArray(strides(), 
results); } +//===----------------------------------------------------------------------===// +// ExtractMapOp +//===----------------------------------------------------------------------===// + +void ExtractMapOp::build(OpBuilder &builder, OperationState &result, + Value vector, Value id, int64_t multiplicity) { + VectorType type = vector.getType().cast(); + VectorType resultType = VectorType::get(type.getNumElements() / multiplicity, + type.getElementType()); + ExtractMapOp::build(builder, result, resultType, vector, id, multiplicity); +} + +static LogicalResult verify(ExtractMapOp op) { + if (op.getSourceVectorType().getShape().size() != 1 || + op.getResultType().getShape().size() != 1) + return op.emitOpError("expects source and destination vectors of rank 1"); + if (op.getResultType().getNumElements() * (int64_t)op.multiplicity() != + op.getSourceVectorType().getNumElements()) + return op.emitOpError("vector sizes mismatch. Source size must be equal " + "to destination size * multiplicity"); + return success(); +} + //===----------------------------------------------------------------------===// // BroadcastOp //===----------------------------------------------------------------------===// @@ -1122,6 +1145,30 @@ populateFromInt64AttrArray(strides(), results); } +//===----------------------------------------------------------------------===// +// InsertMapOp +//===----------------------------------------------------------------------===// + +void InsertMapOp::build(OpBuilder &builder, OperationState &result, + Value vector, Value id, int64_t multiplicity) { + VectorType type = vector.getType().cast(); + VectorType resultType = VectorType::get(type.getNumElements() * multiplicity, + type.getElementType()); + InsertMapOp::build(builder, result, resultType, vector, id, multiplicity); +} + +static LogicalResult verify(InsertMapOp op) { + if (op.getSourceVectorType().getShape().size() != 1 || + op.getResultType().getShape().size() != 1) + return 
op.emitOpError("expected source and destination vectors of rank 1"); + if ((int64_t)op.multiplicity() * op.getSourceVectorType().getNumElements() != + op.getResultType().getNumElements()) + return op.emitOpError( + "vector sizes mismatch. Destination size must be equal " + "to source size * multiplicity"); + return success(); +} + //===----------------------------------------------------------------------===// // InsertStridedSliceOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp --- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp @@ -2418,6 +2418,40 @@ return failure(); } +LogicalResult mlir::vector::PointwiseExtractPattern::matchAndRewrite( + ExtractMapOp extract, PatternRewriter &rewriter) const { + Operation *definedOp = extract.vector().getDefiningOp(); + if (!definedOp || definedOp->getNumResults() != 1) + return failure(); + // TODO: Create an interfaceOp for elementwise operations. 
+ if (!isa(definedOp)) + return failure(); + Location loc = extract.getLoc(); + SmallVector extractOperands; + for (OpOperand &operand : definedOp->getOpOperands()) + extractOperands.push_back(rewriter.create( + loc, operand.get(), extract.id(), extract.multiplicity())); + Operation *newOp = cloneOpWithOperandsAndTypes( + rewriter, loc, definedOp, extractOperands, extract.getResult().getType()); + rewriter.replaceOp(extract, newOp->getResult(0)); + return success(); +} + +Optional +mlir::vector::distributPointwiseVectorOp(OpBuilder &builder, Operation *op, + Value id, int64_t multiplicity) { + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointAfter(op); + Location loc = op->getLoc(); + Value result = op->getResult(0); + DistributeOps ops; + ops.extract = + builder.create(loc, result, id, multiplicity); + ops.insert = + builder.create(loc, ops.extract, id, multiplicity); + return ops; +} + // TODO: Add pattern to rewrite ExtractSlices(ConstantMaskOp). // TODO: Add this as DRR pattern. 
void mlir::vector::populateVectorToVectorTransformationPatterns( diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -60,7 +60,7 @@ MLIRExecutionEngine MLIRIR MLIRParser - MLIRStandardOps + MLIRStandard MLIRTargetLLVMIR MLIRTransforms MLIRStandardToLLVM diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -30,10 +30,10 @@ MLIRTransformsPassIncGen LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRAnalysis MLIRCopyOpInterface - MLIRLinalgOps + MLIRLinalg MLIRLoopLikeInterface MLIRSCF MLIRPass diff --git a/mlir/lib/Transforms/Utils/CMakeLists.txt b/mlir/lib/Transforms/Utils/CMakeLists.txt --- a/mlir/lib/Transforms/Utils/CMakeLists.txt +++ b/mlir/lib/Transforms/Utils/CMakeLists.txt @@ -14,10 +14,10 @@ MLIRStandardOpsIncGen LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRAnalysis MLIRLoopAnalysis MLIRSCF MLIRPass - MLIRStandardOps + MLIRStandard ) diff --git a/mlir/test/Dialect/Async/ops.mlir b/mlir/test/Dialect/Async/ops.mlir --- a/mlir/test/Dialect/Async/ops.mlir +++ b/mlir/test/Dialect/Async/ops.mlir @@ -1,16 +1,46 @@ // RUN: mlir-opt %s | FileCheck %s -// CHECK-LABEL: @identity -func @identity(%arg0 : !async.token) -> !async.token { +// CHECK-LABEL: @identity_token +func @identity_token(%arg0 : !async.token) -> !async.token { // CHECK: return %arg0 : !async.token return %arg0 : !async.token } +// CHECK-LABEL: @identity_value +func @identity_value(%arg0 : !async.value) -> !async.value { + // CHECK: return %arg0 : !async.value + return %arg0 : !async.value +} + // CHECK-LABEL: @empty_async_execute func @empty_async_execute() -> !async.token { - %0 = async.execute { + %done = async.execute { async.yield } : !async.token - return %0 : !async.token + // CHECK: return %done : !async.token + return %done : !async.token 
+} + +// CHECK-LABEL: @return_async_value +func @return_async_value() -> !async.value { + %done, %values = async.execute { + %cst = constant 1.000000e+00 : f32 + async.yield %cst : f32 + } : !async.token, !async.value + + // CHECK: return %values : !async.value + return %values : !async.value +} + +// CHECK-LABEL: @return_async_values +func @return_async_values() -> (!async.value, !async.value) { + %done, %values:2 = async.execute { + %cst1 = constant 1.000000e+00 : f32 + %cst2 = constant 2.000000e+00 : f32 + async.yield %cst1, %cst2 : f32, f32 + } : !async.token, !async.value, !async.value + + // CHECK: return %values#0, %values#1 : !async.value, !async.value + return %values#0, %values#1 : !async.value, !async.value } diff --git a/mlir/test/Dialect/Linalg/fusion-pattern.mlir b/mlir/test/Dialect/Linalg/fusion-pattern.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Linalg/fusion-pattern.mlir @@ -0,0 +1,297 @@ +// RUN: mlir-opt %s -test-linalg-fusion-transform-patterns -canonicalize -cse -split-input-file | FileCheck %s + +module { + func @basic_fusion(%arg0: memref, %arg1: memref, + %arg2: memref) { + %cst = constant 0.000000e+00 : f32 + linalg.fill(%arg2, %cst) : memref, f32 + linalg.matmul {__internal_linalg_transform__ = "basic_fusion"} + ins(%arg0, %arg1 : memref, memref) + outs(%arg2 : memref) + return + } +} + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (64, -d0 + s0)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)> +// CHECK: func @basic_fusion +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref +// CHECK-DAG: %[[C0:.+]] = constant 0 : index +// CHECK-DAG: %[[C1:.+]] = constant 1 : index +// CHECK-DAG: %[[C32:.+]] = constant 32 : index +// CHECK-DAG: %[[C64:.+]] = constant 
64 : index +// CHECK-DAG: %[[C16:.+]] = constant 16 : index +// CHECK-DAG: %[[CST:.+]] = constant 0.0{{.*}} : f32 +// CHECK-DAG: linalg.fill(%[[ARG2]], %[[CST]]) +// CHECK-SAME: __internal_linalg_transform__ = "after_basic_fusion_original" +// CHECK-DAG: %[[M:.+]] = dim %[[ARG0]], %[[C0]] +// CHECK-DAG: %[[N:.+]] = dim %[[ARG1]], %[[C1]] +// CHECK: scf.parallel (%[[IV0:.+]], %[[IV1:.+]]) = +// CHECK-SAME: to (%[[M]], %[[N]]) +// CHECK-SAME: step (%[[C32]], %[[C64]]) { +// CHECK: %[[TILE_M:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M]]] +// CHECK: %[[K:.+]] = dim %[[ARG0]], %[[C1]] +// CHECK: %[[SV1:.+]] = subview %[[ARG0]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M]], %[[K]]] +// CHECK: %[[K_2:.+]] = dim %[[ARG1]], %[[C0]] +// CHECK: %[[TILE_N:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[N]]] +// CHECK: %[[SV2:.+]] = subview %[[ARG1]][0, %[[IV1]]] +// CHECK-SAME: %[[K_2]], %[[TILE_N]] +// CHECK: %[[M_2:.+]] = dim %[[ARG2]], %[[C0]] +// CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M_2]]] +// CHECK: %[[N_2:.+]] = dim %[[ARG2]], %[[C1]] +// CHECK: %[[TILE_N_2:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[N_2]]] +// CHECK: %[[SV3:.+]] = subview %[[ARG2]][%[[IV0]], %[[IV1]]] +// CHECK-SAME: [%[[TILE_M_2]], %[[TILE_N_2]]] +// CHECK: linalg.fill(%[[SV3]], %[[CST]]) +// CHECK-SAME: __internal_linalg_transform__ = "after_basic_fusion_producer" +// CHECK: scf.for %[[IV2:.+]] = %[[C0]] to %[[K]] step %[[C16]] { +// CHECK: %[[TILE_K:.+]] = affine.min #[[MAP3]](%[[IV2]])[%[[K]]] +// CHECK: %[[SV4:.+]] = subview %[[SV1]][0, %[[IV2]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_K]]] +// CHECK: %[[TILE_K_2:.+]] = affine.min #[[MAP3]](%[[IV2]])[%[[K_2]]] +// CHECK: %[[SV5:.+]] = subview %[[SV2]][%[[IV2]], 0] +// CHECK-SAME: [%[[TILE_K_2]], %[[TILE_N]]] +// CHECK: linalg.matmul +// CHECK-SAME: __internal_linalg_transform__ = "after_basic_fusion" +// CHECK-SAME: ins(%[[SV4]], %[[SV5]] +// CHECK-SAME: : memref, memref) +// CHECK-SAME: outs(%[[SV3]] : memref) +// CHECK: } +// CHECK: } +// 
CHECK: linalg.matmul +// CHECK-SAME: __internal_linalg_transform__ = "after_basic_fusion_original" + +// ----- + +module { + func @rhs_fusion(%arg0: memref, %arg1: memref, + %arg2: memref, %arg3: memref) { + %cst = constant 0.000000e+00 : f32 + linalg.copy(%arg1, %arg2) : memref, memref + linalg.fill(%arg3, %cst) : memref, f32 + linalg.matmul {__internal_linalg_transform__ = "rhs_fusion"} + ins(%arg0, %arg2 : memref, memref) + outs(%arg3 : memref) + return + } +} +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (64, -d0 + s0)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)> +// CHECK: func @rhs_fusion +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: memref +// CHECK-DAG: %[[C0:.+]] = constant 0 : index +// CHECK-DAG: %[[C1:.+]] = constant 1 : index +// CHECK-DAG: %[[C32:.+]] = constant 32 : index +// CHECK-DAG: %[[C64:.+]] = constant 64 : index +// CHECK-DAG: %[[C16:.+]] = constant 16 : index +// CHECK-DAG: %[[CST:.+]] = constant 0.0{{.*}} : f32 +// CHECK-DAG: linalg.copy(%[[ARG1]], %[[ARG2]]) +// CHECK-SAME: __internal_linalg_transform__ = "after_rhs_fusion_original" +// CHECK-DAG: %[[N:.+]] = dim %[[ARG2]], %[[C1]] +// CHECK: scf.parallel (%[[IV0:.+]]) = +// CHECK-SAME: (%[[C0]]) to (%[[N]]) step (%[[C64]]) { +// CHECK: %[[K:.+]] = dim %[[ARG2]], %[[C0]] +// CHECK: %[[TILE_N:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[N]]] +// CHECK: %[[SV1:.+]] = subview %[[ARG2]][0, %[[IV0]]] +// CHECK-SAME: [%[[K]], %[[TILE_N]]] +// CHECK: %[[M:.+]] = dim %[[ARG3]], %[[C0]] +// CHECK: %[[N_2:.+]] = dim %[[ARG3]], %[[C1]] +// CHECK: %[[TILE_N_2:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[N_2]]] +// CHECK: %[[SV2:.+]] = subview %[[ARG3]][0, %[[IV0]]] +// CHECK-SAME: [%[[M]], 
%[[TILE_N_2]]] +// CHECK: %[[SV3:.+]] = subview %[[ARG1]][0, %[[IV0]]] +// CHECK-SAME: [%[[K]], %[[TILE_N]]] +// CHECK: linalg.copy(%[[SV3]], %[[SV1]]) +// CHECK-SAME: __internal_linalg_transform__ = "after_rhs_fusion_producer" +// CHECK-NOT: linalg.fill +// CHECK-DAG: %[[M_2:.+]] = dim %[[ARG0]], %[[C0]] +// CHECK-DAG: %[[K_2:.+]] = dim %[[ARG0]], %[[C1]] +// CHECK: scf.parallel (%[[IV1:.+]]) = +// CHECK-SAME: (%[[C0]]) to (%[[M_2]]) step (%[[C32]]) { +// CHECK-NEXT: scf.for %[[IV2:.+]] = %[[C0]] to %[[K_2]] step %[[C16]] { +// CHECK: %[[TILE_M:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[M_2]]] +// CHECK: %[[TILE_K:.+]] = affine.min #[[MAP3]](%[[IV2]])[%[[K_2]]] +// CHECK: %[[SV4:.+]] = subview %[[ARG0]][%[[IV1]], %[[IV2]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_K]]] +// CHECK: %[[TILE_K_2:.+]] = affine.min #[[MAP3]](%[[IV2]])[%[[K]]] +// CHECK: %[[SV5:.+]] = subview %[[SV1]][%[[IV2]], 0] +// CHECK-SAME: [%[[TILE_K_2]], %[[TILE_N]]] +// CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[M]]] +// CHECK: %[[SV6:.+]] = subview %[[SV2]][%[[IV1]], 0] +// CHECK-SAME: [%[[TILE_M_2]], %[[TILE_N_2]]] +// CHECK: linalg.matmul +// CHECK-SAME: __internal_linalg_transform__ = "after_rhs_fusion" +// CHECK-SAME: ins(%[[SV4]], %[[SV5]] +// CHECK-SAME: : memref, memref) +// CHECK-SAME: outs(%[[SV6]] : memref) +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: linalg.matmul +// CHECK-SAME: __internal_linalg_transform__ = "after_rhs_fusion_original" + + +// ----- + +module { + func @two_operand_fusion(%arg0: memref, %arg1: memref, + %arg2: memref, %arg3: memref) { + %cst = constant 0.000000e+00 : f32 + linalg.copy(%arg0, %arg1) : memref, memref + linalg.fill(%arg3, %cst) : memref, f32 + linalg.matmul {__internal_linalg_transform__ = "two_operand_fusion"} + ins(%arg1, %arg2 : memref, memref) + outs(%arg3 : memref) + return + } +} +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> 
+// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (64, -d0 + s0)> +// CHECK: func @two_operand_fusion +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: memref +// CHECK-DAG: %[[C0:.+]] = constant 0 : index +// CHECK-DAG: %[[C1:.+]] = constant 1 : index +// CHECK-DAG: %[[C32:.+]] = constant 32 : index +// CHECK-DAG: %[[C64:.+]] = constant 64 : index +// CHECK-DAG: %[[C16:.+]] = constant 16 : index +// CHECK-DAG: %[[CST:.+]] = constant 0.0{{.*}} : f32 +// CHECK: linalg.copy(%[[ARG0]], %[[ARG1]]) +// CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion_original" +// CHECK: linalg.fill(%[[ARG3]], %[[CST]]) +// CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion_original" +// CHECK-DAG: %[[M:.+]] = dim %[[ARG1]], %[[C0]] +// CHECK: scf.parallel (%[[IV0:.+]]) = +// CHECK-SAME: (%[[C0]]) to (%[[M]]) step (%[[C32]]) { +// CHECK: %[[TILE_M:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M]]] +// CHECK: %[[K:.+]] = dim %[[ARG1]], %[[C1]] +// CHECK: %[[SV1:.+]] = subview %[[ARG1]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M]], %[[K]]] +// CHECK: %[[M_2:.+]] = dim %[[ARG3]], %[[C0]] +// CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M_2]]] +// CHECK: %[[N:.+]] = dim %[[ARG3]], %[[C1]] +// CHECK: %[[SV2:.+]] = subview %[[ARG3]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M_2]], %[[N]]] +// CHECK: %[[SV3:.+]] = subview %[[ARG0]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M]], %[[K]]] +// CHECK: linalg.copy(%[[SV3]], %[[SV1]]) +// CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion_producer" +// CHECK: linalg.fill(%[[SV2]], %[[CST]]) +// CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion_producer" +// CHECK-DAG: %[[N_2:.+]] = dim %[[ARG2]], %[[C1]] +// CHECK: scf.parallel (%[[IV1:.+]]) = +// CHECK-SAME: (%[[C0]]) 
to (%[[N_2]]) step (%[[C64]]) { +// CHECK-NEXT: scf.for %[[IV2:.+]] = %[[C0]] to %[[K]] step %[[C16]] { +// CHECK: %[[TILE_K:.+]] = affine.min #[[MAP2]](%[[IV2]])[%[[K]]] +// CHECK: %[[SV4:.+]] = subview %[[SV1]][0, %[[IV2]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_K]]] +// CHECK: %[[K_2:.+]] = dim %[[ARG2]], %[[C0]] +// CHECK: %[[TILE_K_2:.+]] = affine.min #[[MAP2]](%[[IV2]])[%[[K_2]]] +// CHECK: %[[TILE_N:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[N_2]]] +// CHECK: %[[SV5:.+]] = subview %[[ARG2]][%[[IV2]], %[[IV1]]] +// CHECK-SAME: [%[[TILE_K_2]], %[[TILE_N]]] +// CHECK: %[[TILE_N_2:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[N]]] +// CHECK: %[[SV6:.+]] = subview %[[SV2]][0, %[[IV1]]] +// CHECK-SAME: [%[[TILE_M_2]], %[[TILE_N_2]]] +// CHECK: linalg.matmul +// CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion" +// CHECK-SAME: ins(%[[SV4]], %[[SV5]] +// CHECK-SAME: : memref, memref) +// CHECK-SAME: outs(%[[SV6]] : memref) +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: linalg.matmul +// CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion_original" + +// ----- + +module { + func @matmul_fusion(%arg0: memref, %arg1: memref, + %arg2: memref, %arg3: memref, + %arg4: memref) { + linalg.matmul ins(%arg0, %arg1 : memref, memref) + outs(%arg2 : memref) + linalg.matmul {__internal_linalg_transform__ = "lhs_fusion"} + ins(%arg2, %arg3 : memref, memref) + outs(%arg4 : memref) + return + } +} +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (64, -d0 + s0)> +// CHECK: func @matmul_fusion +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]+]]: memref 
+// CHECK-DAG: %[[C0:.+]] = constant 0 : index +// CHECK-DAG: %[[C1:.+]] = constant 1 : index +// CHECK-DAG: %[[C32:.+]] = constant 32 : index +// CHECK-DAG: %[[C64:.+]] = constant 64 : index +// CHECK-DAG: %[[C16:.+]] = constant 16 : index +// CHECK: linalg.matmul +// CHECK-SAME: __internal_linalg_transform__ = "after_lhs_fusion_original" +// CHECK-DAG: %[[M:.+]] = dim %[[ARG2]], %[[C0]] +// CHECK: scf.parallel (%[[IV0:.+]]) = +// CHECK-SAME: (%[[C0]]) to (%[[M]]) step (%[[C32]]) { +// CHECK: %[[TILE_M:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M]]] +// CHECK: %[[K2:.+]] = dim %[[ARG2]], %[[C1]] +// CHECK: %[[SV1:.+]] = subview %[[ARG2]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M]], %[[K2]]] +// CHECK: %[[M_2:.+]] = dim %[[ARG4]], %[[C0]] +// CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M_2]]] +// CHECK: %[[N:.+]] = dim %[[ARG4]], %[[C1]] +// CHECK: %[[SV2:.+]] = subview %[[ARG4]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M_2]], %[[N]]] +// CHECK: %[[K1:.+]] = dim %[[ARG0]], %[[C1]] +// CHECK: %[[SV3:.+]] = subview %[[ARG0]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M]], %[[K1]]] +// CHECK: %[[SV4:.+]] = subview %[[ARG1]][0, 0] [%[[K1]], %[[K2]]] +// CHECK: linalg.matmul +// CHECK-SAME: __internal_linalg_transform__ = "after_lhs_fusion_producer" +// CHECK-SAME: ins(%[[SV3]], %[[SV4]] +// CHECK-SAME: : memref, memref) +// CHECK-SAME: outs(%[[SV1]] : memref) +// CHECK-DAG: %[[N_2:.+]] = dim %[[ARG3]], %[[C1]] +// CHECK: scf.parallel (%[[IV1:.+]]) = +// CHECK-SAME: (%[[C0]]) to (%[[N_2]]) step (%[[C64]]) { +// CHECK-NEXT: scf.for %[[IV2:.+]] = %[[C0]] to %[[K]] step %[[C16]] { +// CHECK: %[[TILE_K:.+]] = affine.min #[[MAP2]](%[[IV2]])[%[[K]]] +// CHECK: %[[SV6:.+]] = subview %[[SV1]][0, %[[IV2]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_K]]] +// CHECK: %[[K_2:.+]] = dim %[[ARG3]], %[[C0]] +// CHECK: %[[TILE_K_2:.+]] = affine.min #[[MAP2]](%[[IV2]])[%[[K_2]]] +// CHECK: %[[TILE_N:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[N_2]]] +// CHECK: %[[SV7:.+]] = subview 
%[[ARG3]][%[[IV2]], %[[IV1]]] +// CHECK-SAME: [%[[TILE_K_2]], %[[TILE_N]]] +// CHECK: %[[TILE_N_2:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[N]]] +// CHECK: %[[SV8:.+]] = subview %[[SV2]][0, %[[IV1]]] +// CHECK-SAME: [%[[TILE_M_2]], %[[TILE_N_2]]] +// CHECK: linalg.matmul +// CHECK-SAME: __internal_linalg_transform__ = "after_lhs_fusion" +// CHECK-SAME: ins(%[[SV6]], %[[SV7]] +// CHECK-SAME: : memref, memref) +// CHECK-SAME: outs(%[[SV8]] : memref) +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: linalg.matmul +// CHECK-SAME: __internal_linalg_transform__ = "after_lhs_fusion_original" diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1328,3 +1328,31 @@ // expected-error@+1 {{'vector.compressstore' op expected value dim to match mask dim}} vector.compressstore %base, %mask, %value : memref, vector<17xi1>, vector<16xf32> } + +// ----- + +func @extract_map_rank(%v: vector<2x32xf32>, %id : index) { + // expected-error@+1 {{'vector.extract_map' op expects source and destination vectors of rank 1}} + %0 = vector.extract_map %v[%id : 32] : vector<2x32xf32> to vector<2x1xf32> +} + +// ----- + +func @extract_map_size(%v: vector<63xf32>, %id : index) { + // expected-error@+1 {{'vector.extract_map' op vector sizes mismatch. Source size must be equal to destination size * multiplicity}} + %0 = vector.extract_map %v[%id : 32] : vector<63xf32> to vector<2xf32> +} + +// ----- + +func @insert_map_rank(%v: vector<2x1xf32>, %id : index) { + // expected-error@+1 {{'vector.insert_map' op expected source and destination vectors of rank 1}} + %0 = vector.insert_map %v, %id, 32 : vector<2x1xf32> to vector<2x32xf32> +} + +// ----- + +func @insert_map_size(%v: vector<1xf32>, %id : index) { + // expected-error@+1 {{'vector.insert_map' op vector sizes mismatch. 
Destination size must be equal to source size * multiplicity}} + %0 = vector.insert_map %v, %id, 32 : vector<1xf32> to vector<64xf32> +} diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -432,3 +432,14 @@ vector.compressstore %base, %mask, %0 : memref, vector<16xi1>, vector<16xf32> return } + +// CHECK-LABEL: @extract_insert_map +func @extract_insert_map(%v: vector<32xf32>, %id : index) -> vector<32xf32> { + // CHECK: %[[V:.*]] = vector.extract_map %{{.*}}[%{{.*}} : 16] : vector<32xf32> to vector<2xf32> + %vd = vector.extract_map %v[%id : 16] : vector<32xf32> to vector<2xf32> + // CHECK: %[[R:.*]] = vector.insert_map %[[V]], %{{.*}}, 16 : vector<2xf32> to vector<32xf32> + %r = vector.insert_map %vd, %id, 16 : vector<2xf32> to vector<32xf32> + // CHECK: return %[[R]] : vector<32xf32> + return %r : vector<32xf32> +} + diff --git a/mlir/test/Dialect/Vector/vector-distribution.mlir b/mlir/test/Dialect/Vector/vector-distribution.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Vector/vector-distribution.mlir @@ -0,0 +1,13 @@ +// RUN: mlir-opt %s -test-vector-distribute-patterns | FileCheck %s + +// CHECK-LABEL: func @distribute_vector_add +// CHECK-SAME: (%[[ID:.*]]: index +// CHECK-NEXT: %[[EXA:.*]] = vector.extract_map %{{.*}}[%[[ID]] : 32] : vector<32xf32> to vector<1xf32> +// CHECK-NEXT: %[[EXB:.*]] = vector.extract_map %{{.*}}[%[[ID]] : 32] : vector<32xf32> to vector<1xf32> +// CHECK-NEXT: %[[ADD:.*]] = addf %[[EXA]], %[[EXB]] : vector<1xf32> +// CHECK-NEXT: %[[INS:.*]] = vector.insert_map %[[ADD]], %[[ID]], 32 : vector<1xf32> to vector<32xf32> +// CHECK-NEXT: return %[[INS]] : vector<32xf32> +func @distribute_vector_add(%id : index, %A: vector<32xf32>, %B: vector<32xf32>) -> vector<32xf32> { + %0 = addf %A, %B : vector<32xf32> + return %0: vector<32xf32> +} diff --git a/mlir/test/EDSC/CMakeLists.txt b/mlir/test/EDSC/CMakeLists.txt --- 
a/mlir/test/EDSC/CMakeLists.txt +++ b/mlir/test/EDSC/CMakeLists.txt @@ -10,14 +10,14 @@ target_link_libraries(mlir-edsc-builder-api-test PRIVATE - MLIRAffineOps + MLIRAffine MLIRAffineEDSC MLIREDSC MLIRIR + MLIRLinalg MLIRLinalgEDSC - MLIRLinalgOps MLIRSCF - MLIRStandardOps + MLIRStandard MLIRTransforms MLIRVector ) diff --git a/mlir/test/lib/Dialect/Test/CMakeLists.txt b/mlir/test/lib/Dialect/Test/CMakeLists.txt --- a/mlir/test/lib/Dialect/Test/CMakeLists.txt +++ b/mlir/test/lib/Dialect/Test/CMakeLists.txt @@ -38,7 +38,7 @@ MLIRInferTypeOpInterface MLIRLinalgTransforms MLIRPass - MLIRStandardOps + MLIRStandard MLIRStandardOpsTransforms MLIRTransformUtils MLIRTransforms diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -16,6 +16,7 @@ TestGpuMemoryPromotion.cpp TestGpuParallelLoopMapping.cpp TestInlining.cpp + TestLinalgFusionTransforms.cpp TestLinalgHoisting.cpp TestLinalgTransforms.cpp TestLiveness.cpp @@ -38,12 +39,12 @@ MLIRStandardOpsIncGen LINK_LIBS PUBLIC - MLIRAffineOps + MLIRAffine MLIRAnalysis MLIREDSC MLIRGPU MLIRGPUToGPURuntimeTransforms - MLIRLinalgOps + MLIRLinalg MLIRLinalgTransforms MLIRNVVMIR MLIRSCF diff --git a/mlir/test/lib/Transforms/TestLinalgFusionTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgFusionTransforms.cpp new file mode 100644 --- /dev/null +++ b/mlir/test/lib/Transforms/TestLinalgFusionTransforms.cpp @@ -0,0 +1,112 @@ +//===- TestLinalgFusionTransforms.cpp - Test Linalg fusion patterns -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements logic for testing Linalg fusion patterns. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace mlir::linalg; + +namespace { +struct TestLinalgFusionTransforms + : public PassWrapper { + TestLinalgFusionTransforms() = default; + TestLinalgFusionTransforms(const TestLinalgFusionTransforms &pass) {} + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnFunction() override; +}; +} // namespace + +static void fillFusionPatterns(MLIRContext *context, + const LinalgDependenceGraph &dependenceGraph, + OwningRewritePatternList &patterns) { + patterns.insert>( + context, dependenceGraph, + LinalgTilingOptions() + .setTileSizes({32, 64, 16}) + .setLoopType(LinalgTilingLoopType::ParallelLoops), + LinalgFusionOptions(), + LinalgMarker(Identifier::get("basic_fusion", context), + Identifier::get("after_basic_fusion", context)), + LinalgMarker(ArrayRef(), + Identifier::get("after_basic_fusion_producer", context)), + LinalgMarker(ArrayRef(), + Identifier::get("after_basic_fusion_original", context))); + + patterns.insert>( + context, dependenceGraph, + LinalgTilingOptions() + .setTileSizes({32, 64, 16}) + .setLoopType(LinalgTilingLoopType::ParallelLoops), + LinalgFusionOptions().setIndicesToFuse({0}), + LinalgMarker(Identifier::get("lhs_fusion", context), + Identifier::get("after_lhs_fusion", context)), + LinalgMarker(ArrayRef(), + Identifier::get("after_lhs_fusion_producer", context)), + LinalgMarker(ArrayRef(), + Identifier::get("after_lhs_fusion_original", context))); + + patterns.insert>( + context, dependenceGraph, + LinalgTilingOptions() + .setTileSizes({32, 64, 16}) + .setLoopType(LinalgTilingLoopType::ParallelLoops), + LinalgFusionOptions().setIndicesToFuse({1}), + LinalgMarker(Identifier::get("rhs_fusion", context), + 
Identifier::get("after_rhs_fusion", context)), + LinalgMarker(ArrayRef(), + Identifier::get("after_rhs_fusion_producer", context)), + LinalgMarker(ArrayRef(), + Identifier::get("after_rhs_fusion_original", context))); + + patterns.insert>( + context, dependenceGraph, + LinalgTilingOptions() + .setTileSizes({32, 64, 16}) + .setLoopType(LinalgTilingLoopType::ParallelLoops), + LinalgFusionOptions().setIndicesToFuse({0, 2}), + LinalgMarker(Identifier::get("two_operand_fusion", context), + Identifier::get("after_two_operand_fusion", context)), + LinalgMarker( + ArrayRef(), + Identifier::get("after_two_operand_fusion_producer", context)), + LinalgMarker( + ArrayRef(), + Identifier::get("after_two_operand_fusion_original", context))); +} + +static void applyFusionPatterns(MLIRContext *context, FuncOp funcOp) { + OwningRewritePatternList fusionPatterns; + Aliases alias; + LinalgDependenceGraph dependenceGraph = + LinalgDependenceGraph::buildDependenceGraph(alias, funcOp); + fillFusionPatterns(context, dependenceGraph, fusionPatterns); + applyPatternsAndFoldGreedily(funcOp, fusionPatterns); +} + +void TestLinalgFusionTransforms::runOnFunction() { + applyFusionPatterns(&getContext(), getFunction()); +} + +namespace mlir { +void registerTestLinalgFusionTransforms() { + PassRegistration testFusionTransformsPass( + "test-linalg-fusion-transform-patterns", + "Test Linalg fusion transformation patterns by applying them greedily."); +} +} // namespace mlir diff --git a/mlir/test/lib/Transforms/TestVectorTransforms.cpp b/mlir/test/lib/Transforms/TestVectorTransforms.cpp --- a/mlir/test/lib/Transforms/TestVectorTransforms.cpp +++ b/mlir/test/lib/Transforms/TestVectorTransforms.cpp @@ -125,6 +125,28 @@ } }; +struct TestVectorDistributePatterns + : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnFunction() override { + MLIRContext *ctx = &getContext(); + OwningRewritePatternList patterns; + FuncOp func = 
getFunction(); + func.walk([&](AddFOp op) { + OpBuilder builder(op); + Optional ops = distributPointwiseVectorOp( + builder, op.getOperation(), func.getArgument(0), 32); + assert(ops.hasValue()); + SmallPtrSet extractOp({ops->extract}); + op.getResult().replaceAllUsesExcept(ops->insert.getResult(), extractOp); + }); + patterns.insert(ctx); + applyPatternsAndFoldGreedily(getFunction(), patterns); + } +}; + struct TestVectorTransferFullPartialSplitPatterns : public PassWrapper { @@ -178,5 +200,9 @@ vectorTransformFullPartialPass("test-vector-transfer-full-partial-split", "Test conversion patterns to split " "transfer ops via scf.if + linalg ops"); + PassRegistration distributePass( + "test-vector-distribute-patterns", + "Test conversion patterns to distribute vector ops in the vector " + "dialect"); } } // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -58,6 +58,7 @@ void registerTestGpuMemoryPromotionPass(); void registerTestGpuParallelLoopMappingPass(); void registerTestInterfaces(); +void registerTestLinalgFusionTransforms(); void registerTestLinalgHoisting(); void registerTestLinalgTransforms(); void registerTestLivenessPass(); @@ -114,6 +115,7 @@ registerTestExpandTanhPass(); registerTestGpuMemoryPromotionPass(); registerTestInterfaces(); + registerTestLinalgFusionTransforms(); registerTestLinalgHoisting(); registerTestLinalgTransforms(); registerTestLivenessPass(); diff --git a/mlir/utils/gdb-scripts/prettyprinters.py b/mlir/utils/gdb-scripts/prettyprinters.py new file mode 100644 --- /dev/null +++ b/mlir/utils/gdb-scripts/prettyprinters.py @@ -0,0 +1,235 @@ +"""GDB pretty printers for MLIR types.""" + +import gdb.printing + + +class IdentifierPrinter: + """Prints an mlir::Identifier instance.""" + + def __init__(self, val): + self.entry = val['entry'] + + def to_string(self): + ptr = (self.entry + 
1).cast(gdb.lookup_type('char').pointer()) + return ptr.string(length=self.entry['keyLength']) + + def display_hint(self): + return 'string' + + +class StoragePrinter: + """Prints bases of a struct and its fields.""" + + def __init__(self, val): + self.val = val + + def children(self): + for field in self.val.type.fields(): + if field.is_base_class: + yield ('<%s>' % field.name, self.val.cast(field.type)) + else: + yield (field.name, self.val[field.name]) + + +class TupleTypeStoragePrinter(StoragePrinter): + + def children(self): + for child in StoragePrinter.children(self): + yield child + pointer_type = gdb.lookup_type('mlir::Type').pointer() + elements = (self.val.address + 1).cast(pointer_type) + for i in range(self.val['numElements']): + yield 'elements[%u]' % i, elements[i] + + +class RankedTypeStoragePrinter(StoragePrinter): + + def children(self): + for child in StoragePrinter.children(self): + yield child + for i in range(self.val['shapeSize']): + yield 'shapeElements[%u]' % i, self.val['shapeElements'][i] + + +class MemRefTypeStoragePrinter(RankedTypeStoragePrinter): + + def children(self): + for child in RankedTypeStoragePrinter.children(self): + yield child + for i in range(self.val['numAffineMaps']): + yield 'affineMapsList[%u]' % i, self.val['affineMapsList'][i] + + +class FusedLocationStoragePrinter(StoragePrinter): + + def children(self): + for child in StoragePrinter.children(self): + yield child + pointer_type = gdb.lookup_type('mlir::Location').pointer() + elements = (self.val.address + 1).cast(pointer_type) + for i in range(self.val['numLocs']): + yield 'locs[%u]' % i, elements[i] + + +class StorageUserBasePrinter: + """Printer for an mlir::detail::StorageUserBase instance.""" + + def __init__(self, val): + self.val = val + + def children(self): + storage_type = self.val.type.template_argument(2) + yield 'impl', self.val['impl'].dereference().cast(storage_type) + + +class StorageTypeMap: + """Maps a TypeID to the corresponding type derived from 
StorageUserBase. + + Types need to be registered by name before the first lookup. + """ + + def __init__(self): + self.map = None + self.type_names = [] + + def register_type(self, type_name): + assert not self.map, 'register_type called after __getitem__' + self.type_names += [type_name] + + def _init_map(self): + """Lazy initialization of self.map.""" + if self.map: + return + self.map = {} + for type_name in self.type_names: + concrete_type = gdb.lookup_type(type_name) + storage = gdb.parse_and_eval( + "&'mlir::TypeID::get<%s>()::instance'" % type_name) + if concrete_type and storage: + self.map[int(storage)] = concrete_type + + def __getitem__(self, type_id): + self._init_map() + return self.map.get(int(type_id['storage'])) + + +storage_type_map = StorageTypeMap() + + +def get_type_id_printer(val): + """Returns a printer of the name of a mlir::TypeID.""" + + class StringPrinter: + + def __init__(self, string): + self.string = string + + def to_string(self): + return self.string + + concrete_type = storage_type_map[val] + if not concrete_type: + return None + return StringPrinter('"%s"' % concrete_type.name) + + +def get_attr_or_type_printer(val, get_type_id): + """Returns a printer for mlir::Attribute or mlir::Type.""" + + class UpcastPrinter: + + def __init__(self, val, type): + self.val = val.cast(type) + + def children(self): + yield 'cast<%s>' % self.val.type.name, self.val + + if not val['impl']: + return None + type_id = get_type_id(val['impl'].dereference()) + concrete_type = storage_type_map[type_id] + if not concrete_type: + return None + return UpcastPrinter(val, concrete_type) + + +pp = gdb.printing.RegexpCollectionPrettyPrinter('MLIRSupport') + +pp.add_printer('mlir::Identifier', '^mlir::Identifier$', IdentifierPrinter) + +# Printers for types deriving from AttributeStorage or TypeStorage. 
+pp.add_printer('mlir::detail::FusedLocationStorage', + '^mlir::detail::FusedLocationStorage', + FusedLocationStoragePrinter) +pp.add_printer('mlir::detail::VectorTypeStorage', + '^mlir::detail::VectorTypeStorage', RankedTypeStoragePrinter) +pp.add_printer('mlir::detail::RankedTensorTypeStorage', + '^mlir::detail::RankedTensorTypeStorage', + RankedTypeStoragePrinter) +pp.add_printer('mlir::detail::MemRefTypeStorage', + '^mlir::detail::MemRefTypeStorage$', MemRefTypeStoragePrinter) +pp.add_printer('mlir::detail::TupleTypeStorage', + '^mlir::detail::TupleTypeStorage$', TupleTypeStoragePrinter) + +# Printers for Attribute::AttrBase or Type::TypeBase typedefs. +pp.add_printer('mlir::detail::StorageUserBase', + '^mlir::detail::StorageUserBase<.*>$', StorageUserBasePrinter) + +# Printers of types deriving from Attribute::AttrBase or Type::TypeBase. +for name in [ + # mlir/IR/Attributes.h + 'ArrayAttr', + 'DictionaryAttr', + 'FloatAttr', + 'IntegerAttr', + 'IntegerSetAttr', + 'OpaqueAttr', + 'StringAttr', + 'SymbolRefAttr', + 'TypeAttr', + 'UnitAttr', + 'DenseStringElementsAttr', + 'DenseIntOrFPElementsAttr', + 'OpaqueElementsAttr', + 'SparseElementsAttr', + # mlir/IR/StandardTypes.h + 'ComplexType', + 'IndexType', + 'IntegerType', + 'Float16Type', + 'Float32Type', + 'Float64Type', + 'NoneType', + 'VectorType', + 'RankedTensorType', + 'UnrankedTensorType', + 'MemRefType', + 'UnrankedMemRefType', + 'TupleType', + # mlir/IR/Location.h + 'CallSiteLoc', + 'FileLineColLoc', + 'FusedLoc', + 'NameLoc', + 'OpaqueLoc', + 'UnknownLoc' +]: + storage_type_map.register_type('mlir::%s' % name) # Register for upcasting. 
+ +pp.add_printer('mlir::TypeID', '^mlir::TypeID$', get_type_id_printer) + + +def add_attr_or_type_printers(name): + """Adds printers for mlir::Attribute or mlir::Type and their Storage type.""" + get_type_id = lambda val: val['abstract%s' % name]['typeID'] + pp.add_printer('mlir::%s' % name, '^mlir::%s$' % name, + lambda val: get_attr_or_type_printer(val, get_type_id)) + pp.add_printer('mlir::%sStorage' % name, '^mlir::%sStorage$' % name, + lambda val: get_type_id_printer(get_type_id(val))) + + +# Upcasting printers of mlir::Attribute and mlir::Type. +for name in ['Attribute', 'Type']: + add_attr_or_type_printers(name) + +gdb.printing.register_pretty_printer(gdb.current_objfile(), pp) diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -21,7 +21,6 @@ #define OFFLOAD_FAIL (~0) #define OFFLOAD_DEVICE_DEFAULT -1 -#define HOST_DEVICE -10 /// Data attributes for each data reference used in an OpenMP target region. enum tgt_map_type { diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -29,8 +29,9 @@ } EXTERN int omp_get_initial_device(void) { - DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE); - return HOST_DEVICE; + int hostDevice = omp_get_num_devices(); + DP("Call to omp_get_initial_device returning %d\n", hostDevice); + return hostDevice; } EXTERN void *omp_target_alloc(size_t size, int device_num) { diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -3876,7 +3876,6 @@ // Constants used in libomptarget #define KMP_DEVICE_DEFAULT -1 // This is libomptarget's default device. -#define KMP_HOST_DEVICE -10 // This is what it is in libomptarget, go figure. #define KMP_DEVICE_ALL -11 // This is libomptarget's "all devices". 
// OMP Pause Resource diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h --- a/openmp/runtime/src/kmp_ftn_entry.h +++ b/openmp/runtime/src/kmp_ftn_entry.h @@ -966,13 +966,15 @@ int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) { #if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB) - return KMP_HOST_DEVICE; + // same as omp_get_num_devices() + return 0; #else int (*fptr)(); if ((*(void **)(&fptr) = dlsym(RTLD_NEXT, "omp_get_initial_device"))) { return (*fptr)(); } else { // liboffload & libomptarget don't exist - return KMP_HOST_DEVICE; + // same as omp_get_num_devices() + return 0; } #endif } @@ -1319,14 +1321,14 @@ // loaded, we assume we are on the host and return KMP_HOST_DEVICE. // Compiler/libomptarget will handle this if called inside target. int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; -int FTN_STDCALL FTN_GET_DEVICE_NUM(void) { return KMP_HOST_DEVICE; } +int FTN_STDCALL FTN_GET_DEVICE_NUM(void) { return FTN_GET_INITIAL_DEVICE(); } // Compiler will ensure that this is only called from host in sequential region int FTN_STDCALL FTN_PAUSE_RESOURCE(kmp_pause_status_t kind, int device_num) { #ifdef KMP_STUB return 1; // just fail #else - if (device_num == KMP_HOST_DEVICE) + if (device_num == FTN_GET_INITIAL_DEVICE()) return __kmpc_pause_resource(kind); else { #if !KMP_OS_WINDOWS diff --git a/openmp/runtime/src/kmp_gsupport.cpp b/openmp/runtime/src/kmp_gsupport.cpp --- a/openmp/runtime/src/kmp_gsupport.cpp +++ b/openmp/runtime/src/kmp_gsupport.cpp @@ -1891,6 +1891,9 @@ kmp_depend_info_t dep_list[ndeps]; for (kmp_int32 i = 0; i < ndeps; i++) dep_list[i] = gomp_depends.get_kmp_depend(i); +#if OMPT_SUPPORT + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif __kmpc_omp_wait_deps(&loc, gtid, ndeps, dep_list, 0, NULL); KA_TRACE(20, ("GOMP_taskwait_depend exit: T#%d\n", gtid)); } diff --git a/openmp/runtime/src/kmp_taskdeps.cpp 
b/openmp/runtime/src/kmp_taskdeps.cpp --- a/openmp/runtime/src/kmp_taskdeps.cpp +++ b/openmp/runtime/src/kmp_taskdeps.cpp @@ -520,7 +520,6 @@ #if OMPT_SUPPORT if (ompt_enabled.enabled) { - OMPT_STORE_RETURN_ADDRESS(gtid); if (!current_task->ompt_task_info.frame.enter_frame.ptr) current_task->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); @@ -531,7 +530,7 @@ current_task ? &(current_task->ompt_task_info.frame) : NULL, &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1, - OMPT_LOAD_RETURN_ADDRESS(gtid)); + OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid)); } new_taskdata->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); @@ -700,7 +699,7 @@ current_task ? &(current_task->ompt_task_info.frame) : NULL, taskwait_task_data, ompt_task_explicit | ompt_task_undeferred | ompt_task_mergeable, 1, - OMPT_GET_RETURN_ADDRESS(0)); + OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid)); } } diff --git a/openmp/runtime/src/ompt-specific.h b/openmp/runtime/src/ompt-specific.h --- a/openmp/runtime/src/ompt-specific.h +++ b/openmp/runtime/src/ompt-specific.h @@ -81,6 +81,11 @@ __kmp_threads[gtid]->th.ompt_thread_info.return_address = \ __builtin_return_address(0) #define OMPT_LOAD_RETURN_ADDRESS(gtid) __ompt_load_return_address(gtid) +#define OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid) \ + ((ompt_enabled.enabled && gtid >= 0 && __kmp_threads[gtid] && \ + __kmp_threads[gtid]->th.ompt_thread_info.return_address)? 
\ + __ompt_load_return_address(gtid): \ + __builtin_return_address(0)) //****************************************************************************** // inline functions diff --git a/openmp/runtime/test/ompt/tasks/dependences_mutexinoutset.c b/openmp/runtime/test/ompt/tasks/dependences_mutexinoutset.c --- a/openmp/runtime/test/ompt/tasks/dependences_mutexinoutset.c +++ b/openmp/runtime/test/ompt/tasks/dependences_mutexinoutset.c @@ -1,10 +1,10 @@ // RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s // REQUIRES: ompt -// GCC does not pass in mutexinoutset -// clang 9 introduced codegen for mutexinoutset +// GCC 9 introduced codegen for mutexinoutset +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8 -// UNSUPPORTED: gcc +// clang 9 introduced codegen for mutexinoutset // UNSUPPORTED: clang-4, clang-5, clang-6, clang-7, clang-8 #include "callback.h" diff --git a/openmp/runtime/test/ompt/tasks/taskwait-depend.c b/openmp/runtime/test/ompt/tasks/task_if0-depend.c copy from openmp/runtime/test/ompt/tasks/taskwait-depend.c copy to openmp/runtime/test/ompt/tasks/task_if0-depend.c --- a/openmp/runtime/test/ompt/tasks/taskwait-depend.c +++ b/openmp/runtime/test/ompt/tasks/task_if0-depend.c @@ -1,10 +1,6 @@ // RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s // REQUIRES: ompt -// The GOMP wrapper does not handle `task if(0) depend()` and drops the -// dependency. 
Once this is fixed, reevaluate the GCC status: -// XFAIL: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8, gcc-9, gcc-10 - #include "callback.h" #include @@ -20,7 +16,6 @@ #pragma omp task depend(out : x) { x++; } print_fuzzy_address(1); - //#pragma omp taskwait depend(in: x) <-- currently not supported in clang #pragma omp task if (0) depend(in : x) {} print_fuzzy_address(2); diff --git a/openmp/runtime/test/ompt/tasks/taskwait-depend.c b/openmp/runtime/test/ompt/tasks/taskwait-depend.c --- a/openmp/runtime/test/ompt/tasks/taskwait-depend.c +++ b/openmp/runtime/test/ompt/tasks/taskwait-depend.c @@ -1,9 +1,13 @@ // RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s // REQUIRES: ompt -// The GOMP wrapper does not handle `task if(0) depend()` and drops the -// dependency. Once this is fixed, reevaluate the GCC status: -// XFAIL: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8, gcc-9, gcc-10 +// taskwait with depend clause was introduced with gcc-9 +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8 + +// clang does not yet support taskwait with depend clause +// clang-12 introduced parsing, but no codegen +// update expected result when codegen in clang was added +// XFAIL: clang #include "callback.h" #include @@ -20,9 +24,7 @@ #pragma omp task depend(out : x) { x++; } print_fuzzy_address(1); - //#pragma omp taskwait depend(in: x) <-- currently not supported in clang -#pragma omp task if (0) depend(in : x) - {} + #pragma omp taskwait depend(in: x) print_fuzzy_address(2); } } diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp --- a/openmp/tools/archer/ompt-tsan.cpp +++ b/openmp/tools/archer/ompt-tsan.cpp @@ -15,18 +15,18 @@ #define __STDC_FORMAT_MACROS #endif +#include #include #include #include #include #include #include +#include #include #include #include -#include #include -#include #include #include @@ -89,17 +89,26 @@ TsanFlags(const char *env) : ignore_noninstrumented_modules(0) { if (env) { std::vector tokens; - std::string token; std::string 
str(env); - std::istringstream iss(str); - while (std::getline(iss, token, ' ')) - tokens.push_back(token); + auto end = str.end(); + auto it = str.begin(); + auto is_sep = [](char c) { + return c == ' ' || c == ',' || c == ':' || c == '\n' || c == '\t' || + c == '\r'; + }; + while (it != end) { + auto next_it = std::find_if(it, end, is_sep); + tokens.emplace_back(it, next_it); + it = next_it; + if (it != end) { + ++it; + } + } - for (std::vector::iterator it = tokens.begin(); - it != tokens.end(); ++it) { + for (const auto &token : tokens) { // we are interested in ignore_noninstrumented_modules to print a // warning - if (sscanf(it->c_str(), "ignore_noninstrumented_modules=%d", + if (sscanf(token.c_str(), "ignore_noninstrumented_modules=%d", &ignore_noninstrumented_modules)) continue; } diff --git a/openmp/tools/archer/tests/lit.cfg b/openmp/tools/archer/tests/lit.cfg --- a/openmp/tools/archer/tests/lit.cfg +++ b/openmp/tools/archer/tests/lit.cfg @@ -93,6 +93,8 @@ # Race Tests config.substitutions.append(("%libarcher-compile-and-run-race", \ "%libarcher-compile && %libarcher-run-race")) +config.substitutions.append(("%libarcher-compile-and-run-nosuppression", \ + "%libarcher-compile && %libarcher-run-nosuppression")) config.substitutions.append(("%libarcher-compile-and-run", \ "%libarcher-compile && %libarcher-run")) config.substitutions.append(("%libarcher-cxx-compile-and-run", \ @@ -102,13 +104,15 @@ config.substitutions.append(("%libarcher-compile", \ "%clang-archer %openmp_flags %archer_flags %flags %s -o %t" + libs)) config.substitutions.append(("%libarcher-run-race", "%suppression %deflake %t 2>&1 | tee %t.log")) +config.substitutions.append(("%libarcher-run-nosuppression", "%nosuppression %t 2>&1 | tee %t.log")) config.substitutions.append(("%libarcher-run", "%suppression %t 2>&1 | tee %t.log")) config.substitutions.append(("%clang-archerXX", config.test_cxx_compiler)) config.substitutions.append(("%clang-archer", config.test_c_compiler)) 
config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) config.substitutions.append(("%archer_flags", config.archer_flags)) config.substitutions.append(("%flags", config.test_flags)) -config.substitutions.append(("%suppression", "env TSAN_OPTIONS='ignore_noninstrumented_modules=1'")) +config.substitutions.append(("%nosuppression", "env TSAN_OPTIONS='ignore_noninstrumented_modules=0'")) +config.substitutions.append(("%suppression", "env TSAN_OPTIONS='ignore_noninstrumented_modules=0:ignore_noninstrumented_modules=1'")) config.substitutions.append(("%deflake", os.path.join(os.path.dirname(__file__), "deflake.bash"))) config.substitutions.append(("FileCheck", config.test_filecheck)) diff --git a/openmp/tools/archer/tests/parallel/parallel-simple.c b/openmp/tools/archer/tests/parallel/parallel-nosuppression.c copy from openmp/tools/archer/tests/parallel/parallel-simple.c copy to openmp/tools/archer/tests/parallel/parallel-nosuppression.c --- a/openmp/tools/archer/tests/parallel/parallel-simple.c +++ b/openmp/tools/archer/tests/parallel/parallel-nosuppression.c @@ -1,5 +1,5 @@ /* - * parallel-simple.c -- Archer testcase + * parallel-nosuppression.c -- Archer testcase */ //===----------------------------------------------------------------------===// @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// -// RUN: %libarcher-compile-and-run | FileCheck %s +// RUN: %libarcher-compile-and-run-nosuppression | FileCheck %s // REQUIRES: tsan #include #include @@ -36,4 +36,5 @@ // CHECK-NOT: ThreadSanitizer: data race // CHECK-NOT: ThreadSanitizer: reported +// CHECK: Warning: please export TSAN_OPTIONS // CHECK: DONE diff --git a/openmp/tools/archer/tests/parallel/parallel-simple.c b/openmp/tools/archer/tests/parallel/parallel-simple.c --- a/openmp/tools/archer/tests/parallel/parallel-simple.c +++ b/openmp/tools/archer/tests/parallel/parallel-simple.c @@ -36,4 +36,5 @@ // CHECK-NOT: ThreadSanitizer: data race // 
CHECK-NOT: ThreadSanitizer: reported +// CHECK-NOT: Warning: please export TSAN_OPTIONS // CHECK: DONE