diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h
--- a/clang-tools-extra/clangd/ClangdServer.h
+++ b/clang-tools-extra/clangd/ClangdServer.h
@@ -130,8 +130,11 @@
   llvm::Optional<std::string> ResourceDir = llvm::None;

   /// Time to wait after a new file version before computing diagnostics.
-  DebouncePolicy UpdateDebounce =
-      DebouncePolicy::fixed(std::chrono::milliseconds(500));
+  DebouncePolicy UpdateDebounce = DebouncePolicy{
+      /*Min=*/std::chrono::milliseconds(50),
+      /*Max=*/std::chrono::milliseconds(500),
+      /*RebuildRatio=*/1,
+  };

   bool SuggestMissingIncludes = false;
diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp
--- a/clang-tools-extra/clangd/ClangdServer.cpp
+++ b/clang-tools-extra/clangd/ClangdServer.cpp
@@ -395,7 +395,9 @@
   WorkScheduler.runWithAST("Rename", File, std::move(Action));
 }

-static llvm::Expected<Tweak::Selection>
+// May generate several candidate selections, due to SelectionTree ambiguity.
+// Vector of pointers because GCC doesn't like non-copyable Selection.
+static llvm::Expected<std::vector<std::unique_ptr<Tweak::Selection>>>
 tweakSelection(const Range &Sel, const InputsAndAST &AST) {
   auto Begin = positionToOffset(AST.Inputs.Contents, Sel.start);
   if (!Begin)
@@ -403,7 +405,16 @@
   auto End = positionToOffset(AST.Inputs.Contents, Sel.end);
   if (!End)
     return End.takeError();
-  return Tweak::Selection(AST.Inputs.Index, AST.AST, *Begin, *End);
+  std::vector<std::unique_ptr<Tweak::Selection>> Result;
+  SelectionTree::createEach(
+      AST.AST.getASTContext(), AST.AST.getTokens(), *Begin, *End,
+      [&](SelectionTree T) {
+        Result.push_back(std::make_unique<Tweak::Selection>(
+            AST.Inputs.Index, AST.AST, *Begin, *End, std::move(T)));
+        return false;
+      });
+  assert(!Result.empty() && "Expected at least one SelectionTree");
+  return std::move(Result);
 }

 void ClangdServer::enumerateTweaks(PathRef File, Range Sel,
@@ -412,12 +423,21 @@
                    this](Expected<InputsAndAST> InpAST) mutable {
     if (!InpAST)
       return CB(InpAST.takeError());
-    auto Selection = tweakSelection(Sel, *InpAST);
-    if (!Selection)
-      return CB(Selection.takeError());
+    auto Selections = tweakSelection(Sel, *InpAST);
+    if (!Selections)
+      return CB(Selections.takeError());
     std::vector<TweakRef> Res;
-    for (auto &T : prepareTweaks(*Selection, TweakFilter))
-      Res.push_back({T->id(), T->title(), T->intent()});
+    // Don't allow a tweak to fire more than once across ambiguous selections.
+    llvm::DenseSet<llvm::StringRef> PreparedTweaks;
+    auto Filter = [&](const Tweak &T) {
+      return TweakFilter(T) && !PreparedTweaks.count(T.id());
+    };
+    for (const auto &Sel : *Selections) {
+      for (auto &T : prepareTweaks(*Sel, Filter)) {
+        Res.push_back({T->id(), T->title(), T->intent()});
+        PreparedTweaks.insert(T->id());
+      }
+    }

     CB(std::move(Res));
   };
@@ -432,21 +452,30 @@
         FS = FSProvider.getFileSystem()](Expected<InputsAndAST> InpAST) mutable {
     if (!InpAST)
       return CB(InpAST.takeError());
-    auto Selection = tweakSelection(Sel, *InpAST);
-    if (!Selection)
-      return CB(Selection.takeError());
-    auto A = prepareTweak(TweakID, *Selection);
-    if (!A)
-      return CB(A.takeError());
-    auto Effect = (*A)->apply(*Selection);
-    if (!Effect)
-      return CB(Effect.takeError());
-    for (auto &It : Effect->ApplyEdits) {
-      Edit &E = It.second;
-      format::FormatStyle Style =
-          getFormatStyleForFile(File, E.InitialCode, FS.get());
-      if (llvm::Error Err = reformatEdit(E, Style))
-        elog("Failed to format {0}: {1}", It.first(), std::move(Err));
+    auto Selections = tweakSelection(Sel, *InpAST);
+    if (!Selections)
+      return CB(Selections.takeError());
+    llvm::Optional<llvm::Expected<Tweak::Effect>> Effect;
+    // Try each selection, take the first one that prepare()s.
+    // If they all fail, Effect will hold the last error.
+    for (const auto &Selection : *Selections) {
+      auto T = prepareTweak(TweakID, *Selection);
+      if (T) {
+        Effect = (*T)->apply(*Selection);
+        break;
+      }
+      Effect = T.takeError();
+    }
+    assert(Effect.hasValue() && "Expected at least one selection");
+    if (*Effect) {
+      // Tweaks don't apply clang-format, do that centrally here.
+      for (auto &It : (*Effect)->ApplyEdits) {
+        Edit &E = It.second;
+        format::FormatStyle Style =
+            getFormatStyleForFile(File, E.InitialCode, FS.get());
+        if (llvm::Error Err = reformatEdit(E, Style))
+          elog("Failed to format {0}: {1}", It.first(), std::move(Err));
+      }
     }
     return CB(std::move(*Effect));
   };
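The applyTweak loop above threads llvm::Expected values through an llvm::Optional so that the first selection that prepare()s wins and the last error survives. A self-contained sketch of the same idiom, with hypothetical tryOne/firstThatWorks helpers (not part of this patch), assuming only LLVM's Error support:

  #include "llvm/ADT/Optional.h"
  #include "llvm/Support/Error.h"
  #include <cassert>
  #include <vector>

  // Hypothetical stand-in for prepare()+apply() on one candidate.
  static llvm::Expected<int> tryOne(int Candidate) {
    if (Candidate % 2 == 0)
      return Candidate * 10; // success
    return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                   "candidate %d failed", Candidate);
  }

  // Try candidates in preference order; the first success wins, otherwise
  // the caller receives the last candidate's error.
  static llvm::Expected<int> firstThatWorks(const std::vector<int> &Candidates) {
    assert(!Candidates.empty() && "expected at least one candidate");
    llvm::Optional<llvm::Expected<int>> Result;
    for (int C : Candidates) {
      if (Result) // discard the previous failure before trying the next one
        llvm::consumeError(Result->takeError());
      Result = tryOne(C);
      if (*Result)
        break;
    }
    return std::move(*Result);
  }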
diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp
--- a/clang-tools-extra/clangd/Hover.cpp
+++ b/clang-tools-extra/clangd/Hover.cpp
@@ -537,9 +537,12 @@
     llvm::consumeError(Offset.takeError());
     return llvm::None;
   }
-  SelectionTree Selection(AST.getASTContext(), AST.getTokens(), *Offset);
+  // Editors send the position on the left of the hovered character.
+  // So our selection tree should be biased right. (Tested with VSCode).
+  SelectionTree ST = SelectionTree::createRight(
+      AST.getASTContext(), AST.getTokens(), *Offset, *Offset);
   std::vector Result;
-  if (const SelectionTree::Node *N = Selection.commonAncestor()) {
+  if (const SelectionTree::Node *N = ST.commonAncestor()) {
     auto Decls = explicitReferenceTargets(N->ASTNode, DeclRelation::Alias);
     if (!Decls.empty()) {
       HI = getHoverContents(Decls.front(), Index);
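The right bias matters because LSP positions sit between characters. For illustration (assuming clangd's ParsedAST; the string and offsets are hypothetical, not from the patch):

  // "int foo;" with the hover position reported just before 'f' (offset 4):
  // only the 'foo' token touches that offset, so the tree selects the VarDecl.
  SelectionTree ST = SelectionTree::createRight(AST.getASTContext(),
                                                AST.getTokens(),
                                                /*Begin=*/4, /*End=*/4);
  const SelectionTree::Node *Hovered = ST.commonAncestor(); // may be null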
diff --git a/clang-tools-extra/clangd/Selection.h b/clang-tools-extra/clangd/Selection.h
--- a/clang-tools-extra/clangd/Selection.h
+++ b/clang-tools-extra/clangd/Selection.h
@@ -29,6 +29,14 @@
 // - we determine which low-level nodes are partly or completely covered
 //   by the selection.
 // - we expose a tree of the selected nodes and their lexical parents.
+//
+// Sadly LSP specifies locations as being between characters, and this causes
+// some ambiguities we cannot cleanly resolve:
+//   lhs+rhs  // targeting '+' or 'lhs'?
+//      ^     // in GUI editors, double-clicking 'lhs' yields this position!
+//
+// The best we can do in these cases is try both, which leads to the awkward
+// SelectionTree::createEach() API.
//===----------------------------------------------------------------------===//

 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_SELECTION_H
@@ -64,16 +72,32 @@
 // point back into the AST it was constructed with.
 class SelectionTree {
 public:
-  // Creates a selection tree at the given byte offset in the main file.
-  // This is approximately equivalent to a range of one character.
-  // (Usually, the character to the right of Offset, sometimes to the left).
-  SelectionTree(ASTContext &AST, const syntax::TokenBuffer &Tokens,
-                unsigned Offset);
-  // Creates a selection tree for the given range in the main file.
-  // The range includes bytes [Start, End).
-  // If Start == End, uses the same heuristics as SelectionTree(AST, Start).
-  SelectionTree(ASTContext &AST, const syntax::TokenBuffer &Tokens,
-                unsigned Start, unsigned End);
+  // Create selection trees for the given range, and pass them to Func.
+  //
+  // There may be multiple possible selection trees:
+  // - if the range is empty and borders two tokens, a tree for the right token
+  //   and a tree for the left token will be yielded.
+  // - Func should return true on success (stop) and false on failure (continue)
+  //
+  // Always yields at least one tree. If no tokens are touched, it is empty.
+  static bool createEach(ASTContext &AST, const syntax::TokenBuffer &Tokens,
+                         unsigned Begin, unsigned End,
+                         llvm::function_ref<bool(SelectionTree)> Func);
+
+  // Create a selection tree for the given range.
+  //
+  // Where ambiguous (range is empty and borders two tokens), prefer the token
+  // on the right.
+  static SelectionTree createRight(ASTContext &AST,
                                   const syntax::TokenBuffer &Tokens,
                                   unsigned Begin, unsigned End);
+
+  // Copies are no good - contain pointers to other nodes.
+  SelectionTree(const SelectionTree &) = delete;
+  SelectionTree &operator=(const SelectionTree &) = delete;
+  // Moves are OK though - internal storage is pointer-stable when moved.
+  SelectionTree(SelectionTree &&) = default;
+  SelectionTree &operator=(SelectionTree &&) = default;

   // Describes to what extent an AST node is covered by the selection.
   enum Selection : unsigned char {
@@ -121,6 +145,11 @@
   const Node &root() const { return *Root; }

 private:
+  // Creates a selection tree for the given range in the main file.
+  // The range includes bytes [Start, End).
+  SelectionTree(ASTContext &AST, const syntax::TokenBuffer &Tokens,
+                unsigned Start, unsigned End);
+
   std::deque<Node> Nodes; // Stable-pointer storage.
   const Node *Root;
   clang::PrintingPolicy PrintPolicy;
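A usage sketch for the new enumeration API (assuming clangd's ParsedAST; the dumpCandidateTrees helper is illustrative): for the ambiguous lhs+rhs position above, the callback sees the right-biased tree (the '+') first, then the tree for 'lhs'.

  void dumpCandidateTrees(ParsedAST &AST, unsigned Offset) {
    SelectionTree::createEach(
        AST.getASTContext(), AST.getTokens(), Offset, Offset,
        [&](SelectionTree T) {
          if (const SelectionTree::Node *N = T.commonAncestor())
            llvm::errs() << N->ASTNode.getNodeKind().asStringRef() << "\n";
          return false; // false: continue, so every candidate tree is visited
        });
  }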
diff --git a/clang-tools-extra/clangd/Selection.cpp b/clang-tools-extra/clangd/Selection.cpp
--- a/clang-tools-extra/clangd/Selection.cpp
+++ b/clang-tools-extra/clangd/Selection.cpp
@@ -142,6 +142,11 @@
     Result = SelectionTree::Partial;
 }

+// As well as comments, don't count semicolons as real tokens.
+// They're not properly claimed as expr-statement is missing from the AST.
+bool shouldIgnore(const syntax::Token &Tok) {
+  return Tok.kind() == tok::comment || Tok.kind() == tok::semi;
+}
+
 // SelectionTester can determine whether a range of tokens from the PP-expanded
 // stream (corresponding to an AST node) is considered selected.
@@ -172,9 +177,7 @@
     });
     // Precompute selectedness and offset for selected spelled tokens.
     for (const syntax::Token *T = SelFirst; T < SelLimit; ++T) {
-      // As well as comments, don't count semicolons as real tokens.
-      // They're not properly claimed as expr-statement is missing from the AST.
-      if (T->kind() == tok::comment || T->kind() == tok::semi)
+      if (shouldIgnore(*T))
        continue;
      SpelledTokens.emplace_back();
      Tok &S = SpelledTokens.back();
@@ -671,24 +674,49 @@
   return std::move(OS.str());
 }

-// Decide which selection emulates a "point" query in between characters.
-static std::pair<unsigned, unsigned> pointBounds(unsigned Offset, FileID FID,
-                                                 ASTContext &AST) {
-  StringRef Buf = AST.getSourceManager().getBufferData(FID);
-  // Edge-cases where the choice is forced.
-  if (Buf.size() == 0)
-    return {0, 0};
-  if (Offset == 0)
-    return {0, 1};
-  if (Offset == Buf.size())
-    return {Offset - 1, Offset};
-  // We could choose either this byte or the previous. Usually we prefer the
-  // character on the right of the cursor (or under a block cursor).
-  // But if that's whitespace/semicolon, we likely want the token on the left.
-  auto IsIgnoredChar = [](char C) { return isWhitespace(C) || C == ';'; };
-  if (IsIgnoredChar(Buf[Offset]) && !IsIgnoredChar(Buf[Offset - 1]))
-    return {Offset - 1, Offset};
-  return {Offset, Offset + 1};
+// Decide which selections emulate a "point" query in between characters.
+// If it's ambiguous (the neighboring characters are selectable tokens), returns
+// both possibilities in preference order.
+// Always returns at least one range - if no tokens touched, an empty range.
+static llvm::SmallVector<std::pair<unsigned, unsigned>, 2>
+pointBounds(unsigned Offset, const syntax::TokenBuffer &Tokens) {
+  const auto &SM = Tokens.sourceManager();
+  SourceLocation Loc = SM.getComposedLoc(SM.getMainFileID(), Offset);
+  llvm::SmallVector<std::pair<unsigned, unsigned>, 2> Result;
+  // Prefer right token over left.
+  for (const syntax::Token &Tok :
+       llvm::reverse(spelledTokensTouching(Loc, Tokens))) {
+    if (shouldIgnore(Tok))
+      continue;
+    unsigned Offset = Tokens.sourceManager().getFileOffset(Tok.location());
+    Result.emplace_back(Offset, Offset + Tok.length());
+  }
+  if (Result.empty())
+    Result.emplace_back(Offset, Offset);
+  return Result;
+}
+
+bool SelectionTree::createEach(ASTContext &AST,
+                               const syntax::TokenBuffer &Tokens,
+                               unsigned Begin, unsigned End,
+                               llvm::function_ref<bool(SelectionTree)> Func) {
+  if (Begin != End)
+    return Func(SelectionTree(AST, Tokens, Begin, End));
+  for (std::pair<unsigned, unsigned> Bounds : pointBounds(Begin, Tokens))
+    if (Func(SelectionTree(AST, Tokens, Bounds.first, Bounds.second)))
+      return true;
+  return false;
+}
+
+SelectionTree SelectionTree::createRight(ASTContext &AST,
+                                         const syntax::TokenBuffer &Tokens,
+                                         unsigned int Begin, unsigned int End) {
+  llvm::Optional<SelectionTree> Result;
+  createEach(AST, Tokens, Begin, End, [&](SelectionTree T) {
+    Result = std::move(T);
+    return true;
+  });
+  return std::move(*Result);
 }

 SelectionTree::SelectionTree(ASTContext &AST, const syntax::TokenBuffer &Tokens,
@@ -698,8 +726,6 @@
   // but that's all clangd has needed so far.
   const SourceManager &SM = AST.getSourceManager();
   FileID FID = SM.getMainFileID();
-  if (Begin == End)
-    std::tie(Begin, End) = pointBounds(Begin, FID, AST);
   PrintPolicy.TerseOutput = true;
   PrintPolicy.IncludeNewlines = false;
@@ -711,10 +737,6 @@
   dlog("Built selection tree\n{0}", *this);
 }

-SelectionTree::SelectionTree(ASTContext &AST, const syntax::TokenBuffer &Tokens,
-                             unsigned Offset)
-    : SelectionTree(AST, Tokens, Offset, Offset) {}
-
 const Node *SelectionTree::commonAncestor() const {
   const Node *Ancestor = Root;
   while (Ancestor->Children.size() == 1 && !Ancestor->Selected)
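A worked example of the new pointBounds (inputs are hypothetical):

  // Buffer "a+b", Offset 1 (between 'a' and '+'):
  //   spelledTokensTouching -> 'a' [0,1) and '+' [1,2)
  //   llvm::reverse         -> '+' first, then 'a' (prefer-right order)
  //   result: {1,2}, {0,1} -> createEach yields two trees, right-biased first.
  // Buffer "a; ", Offset 2 (just after the semicolon):
  //   only ';' touches the offset and shouldIgnore() drops it, so the result
  //   is the empty range {2,2}, and the resulting tree selects nothing.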
diff --git a/clang-tools-extra/clangd/SemanticSelection.cpp b/clang-tools-extra/clangd/SemanticSelection.cpp
--- a/clang-tools-extra/clangd/SemanticSelection.cpp
+++ b/clang-tools-extra/clangd/SemanticSelection.cpp
@@ -39,7 +39,8 @@
   }

   // Get node under the cursor.
-  SelectionTree ST(AST.getASTContext(), AST.getTokens(), *Offset);
+  SelectionTree ST = SelectionTree::createRight(
+      AST.getASTContext(), AST.getTokens(), *Offset, *Offset);
   for (const auto *Node = ST.commonAncestor(); Node != nullptr;
        Node = Node->Parent) {
     if (const Decl *D = Node->ASTNode.get<Decl>()) {
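The loop above walks from the innermost selected node out to the root. A condensed sketch of that traversal (assuming clangd's ParsedAST; the printing is illustrative only):

  void logEnclosingDecls(ParsedAST &AST, unsigned Offset) {
    SelectionTree ST = SelectionTree::createRight(
        AST.getASTContext(), AST.getTokens(), Offset, Offset);
    for (const auto *Node = ST.commonAncestor(); Node; Node = Node->Parent)
      if (const Decl *D = Node->ASTNode.get<Decl>())
        llvm::errs() << D->getDeclKindName() << "\n"; // innermost first
  }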
diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp
--- a/clang-tools-extra/clangd/XRefs.cpp
+++ b/clang-tools-extra/clangd/XRefs.cpp
@@ -134,15 +134,16 @@
 std::vector<const NamedDecl *> getDeclAtPosition(ParsedAST &AST,
                                                  SourceLocation Pos,
                                                  DeclRelationSet Relations) {
-  FileID FID;
-  unsigned Offset;
-  std::tie(FID, Offset) = AST.getSourceManager().getDecomposedSpellingLoc(Pos);
-  SelectionTree Selection(AST.getASTContext(), AST.getTokens(), Offset);
+  unsigned Offset = AST.getSourceManager().getDecomposedSpellingLoc(Pos).second;
   std::vector<const NamedDecl *> Result;
-  if (const SelectionTree::Node *N = Selection.commonAncestor()) {
-    auto Decls = targetDecl(N->ASTNode, Relations);
-    Result.assign(Decls.begin(), Decls.end());
-  }
+  SelectionTree::createEach(AST.getASTContext(), AST.getTokens(), Offset,
                            Offset, [&](SelectionTree ST) {
+                              if (const SelectionTree::Node *N =
+                                      ST.commonAncestor())
+                                llvm::copy(targetDecl(N->ASTNode, Relations),
+                                           std::back_inserter(Result));
+                              return !Result.empty();
+                            });
   return Result;
 }
@@ -712,41 +713,50 @@
 }

 const CXXRecordDecl *findRecordTypeAt(ParsedAST &AST, Position Pos) {
-  const SourceManager &SM = AST.getSourceManager();
-  SourceLocation SourceLocationBeg = SM.getMacroArgExpandedLocation(
-      getBeginningOfIdentifier(Pos, SM, AST.getLangOpts()));
-  unsigned Offset =
-      AST.getSourceManager().getDecomposedSpellingLoc(SourceLocationBeg).second;
-  SelectionTree Selection(AST.getASTContext(), AST.getTokens(), Offset);
-  const SelectionTree::Node *N = Selection.commonAncestor();
-  if (!N)
-    return nullptr;
-
-  // Note: explicitReferenceTargets() will search for both template
-  // instantiations and template patterns, and prefer the former if available
-  // (generally, one will be available for non-dependent specializations of a
-  // class template).
-  auto Decls = explicitReferenceTargets(N->ASTNode, DeclRelation::Underlying);
-  if (Decls.empty())
-    return nullptr;
+  auto RecordFromNode =
+      [](const SelectionTree::Node *N) -> const CXXRecordDecl * {
+    if (!N)
+      return nullptr;
+
+    // Note: explicitReferenceTargets() will search for both template
+    // instantiations and template patterns, and prefer the former if available
+    // (generally, one will be available for non-dependent specializations of a
+    // class template).
+    auto Decls = explicitReferenceTargets(N->ASTNode, DeclRelation::Underlying);
+    if (Decls.empty())
+      return nullptr;
+
+    const NamedDecl *D = Decls[0];
+
+    if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
+      // If this is a variable, use the type of the variable.
+      return VD->getType().getTypePtr()->getAsCXXRecordDecl();
+    }

-  const NamedDecl *D = Decls[0];
+    if (const CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(D)) {
+      // If this is a method, use the type of the class.
+      return Method->getParent();
+    }

-  if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
-    // If this is a variable, use the type of the variable.
-    return VD->getType().getTypePtr()->getAsCXXRecordDecl();
-  }
+    // We don't handle FieldDecl because it's not clear what behaviour
+    // the user would expect: the enclosing class type (as with a
+    // method), or the field's type (as with a variable).

-  if (const CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(D)) {
-    // If this is a method, use the type of the class.
-    return Method->getParent();
-  }
+    return dyn_cast<CXXRecordDecl>(D);
+  };

-  // We don't handle FieldDecl because it's not clear what behaviour
-  // the user would expect: the enclosing class type (as with a
-  // method), or the field's type (as with a variable).
+  const SourceManager &SM = AST.getSourceManager();
+  SourceLocation SourceLocationBeg = SM.getMacroArgExpandedLocation(
+      getBeginningOfIdentifier(Pos, SM, AST.getLangOpts()));
+  unsigned Offset = SM.getDecomposedSpellingLoc(SourceLocationBeg).second;
+  const CXXRecordDecl *Result = nullptr;
+  SelectionTree::createEach(AST.getASTContext(), AST.getTokens(), Offset,
+                            Offset, [&](SelectionTree ST) {
+                              Result = RecordFromNode(ST.commonAncestor());
+                              return Result != nullptr;
+                            });
+  return Result;

-  return dyn_cast<CXXRecordDecl>(D);
 }

 std::vector<const CXXRecordDecl *> typeParents(const CXXRecordDecl *CXXRD) {
diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp
--- a/clang-tools-extra/clangd/refactor/Rename.cpp
+++ b/clang-tools-extra/clangd/refactor/Rename.cpp
@@ -81,7 +81,8 @@
   unsigned Offset =
       AST.getSourceManager().getDecomposedSpellingLoc(TokenStartLoc).second;

-  SelectionTree Selection(AST.getASTContext(), AST.getTokens(), Offset);
+  SelectionTree Selection = SelectionTree::createRight(
+      AST.getASTContext(), AST.getTokens(), Offset, Offset);
   const SelectionTree::Node *SelectedNode = Selection.commonAncestor();
   if (!SelectedNode)
     return {};
diff --git a/clang-tools-extra/clangd/refactor/Tweak.h b/clang-tools-extra/clangd/refactor/Tweak.h
--- a/clang-tools-extra/clangd/refactor/Tweak.h
+++ b/clang-tools-extra/clangd/refactor/Tweak.h
@@ -48,7 +48,7 @@
   /// Input to prepare and apply tweaks.
   struct Selection {
     Selection(const SymbolIndex *Index, ParsedAST &AST, unsigned RangeBegin,
-              unsigned RangeEnd);
+              unsigned RangeEnd, SelectionTree ASTSelection);
     /// The text of the active document.
     llvm::StringRef Code;
     /// The Index for handling codebase related queries.
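getDeclAtPosition and findRecordTypeAt above use createEach's return value as an early exit: return true once a tree produced an answer. A condensed sketch of the idiom (firstNodeKind is a hypothetical helper, assuming clangd's ParsedAST):

  llvm::Optional<std::string> firstNodeKind(ParsedAST &AST, unsigned Offset) {
    llvm::Optional<std::string> Result;
    SelectionTree::createEach(
        AST.getASTContext(), AST.getTokens(), Offset, Offset,
        [&](SelectionTree T) {
          if (const SelectionTree::Node *N = T.commonAncestor())
            Result = N->ASTNode.getNodeKind().asStringRef().str();
          return Result.hasValue(); // true: stop, this tree gave an answer
        });
    return Result;
  }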
diff --git a/clang-tools-extra/clangd/refactor/Tweak.cpp b/clang-tools-extra/clangd/refactor/Tweak.cpp
--- a/clang-tools-extra/clangd/refactor/Tweak.cpp
+++ b/clang-tools-extra/clangd/refactor/Tweak.cpp
@@ -46,10 +46,10 @@
 } // namespace

 Tweak::Selection::Selection(const SymbolIndex *Index, ParsedAST &AST,
-                            unsigned RangeBegin, unsigned RangeEnd)
+                            unsigned RangeBegin, unsigned RangeEnd,
+                            SelectionTree ASTSelection)
     : Index(Index), AST(&AST), SelectionBegin(RangeBegin),
-      SelectionEnd(RangeEnd),
-      ASTSelection(AST.getASTContext(), AST.getTokens(), RangeBegin, RangeEnd) {
+      SelectionEnd(RangeEnd), ASTSelection(std::move(ASTSelection)) {
   auto &SM = AST.getSourceManager();
   Code = SM.getBufferData(SM.getMainFileID());
   Cursor = SM.getComposedLoc(SM.getMainFileID(), RangeBegin);
diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
--- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
@@ -76,8 +76,8 @@
   TU.ExtraArgs = Flags;
   auto AST = TU.build();
   llvm::Annotations::Range R = A.range();
-  SelectionTree Selection(AST.getASTContext(), AST.getTokens(), R.Begin,
-                          R.End);
+  auto Selection = SelectionTree::createRight(
+      AST.getASTContext(), AST.getTokens(), R.Begin, R.End);
   const SelectionTree::Node *N = Selection.commonAncestor();
   if (!N) {
     ADD_FAILURE() << "No node selected!\n" << Code;
diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp
--- a/clang-tools-extra/clangd/unittests/HoverTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp
@@ -654,7 +654,7 @@
         }
       )cpp",
       R"cpp(// Template auto parameter. Nothing (Not useful).
-          template<^auto T>
+          template<au^to T>
           void func() {
           }
           void foo() {
diff --git a/clang-tools-extra/clangd/unittests/SelectionTests.cpp b/clang-tools-extra/clangd/unittests/SelectionTests.cpp
--- a/clang-tools-extra/clangd/unittests/SelectionTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SelectionTests.cpp
@@ -19,20 +19,26 @@
 namespace {
 using ::testing::UnorderedElementsAreArray;

+// Create a selection tree corresponding to a point or pair of points.
+// This uses the precisely-defined createRight semantics. The fuzzier
+// createEach is tested separately.
 SelectionTree makeSelectionTree(const StringRef MarkedCode, ParsedAST &AST) {
   Annotations Test(MarkedCode);
   switch (Test.points().size()) {
-  case 1: // Point selection.
-    return SelectionTree(AST.getASTContext(), AST.getTokens(),
-                         cantFail(positionToOffset(Test.code(), Test.point())));
+  case 1: { // Point selection.
+    unsigned Offset = cantFail(positionToOffset(Test.code(), Test.point()));
+    return SelectionTree::createRight(AST.getASTContext(), AST.getTokens(),
+                                      Offset, Offset);
+  }
   case 2: // Range selection.
-    return SelectionTree(
+    return SelectionTree::createRight(
         AST.getASTContext(), AST.getTokens(),
         cantFail(positionToOffset(Test.code(), Test.points()[0])),
         cantFail(positionToOffset(Test.code(), Test.points()[1])));
   default:
     ADD_FAILURE() << "Expected 1-2 points for selection.\n" << MarkedCode;
-    return SelectionTree(AST.getASTContext(), AST.getTokens(), 0u, 0u);
+    return SelectionTree::createRight(AST.getASTContext(), AST.getTokens(), 0u,
+                                      0u);
   }
 }

@@ -554,6 +560,61 @@
   EXPECT_EQ("CXXConstructExpr", nodeKind(&Str->outerImplicit()));
 }

+TEST(SelectionTest, CreateAll) {
+  llvm::Annotations Test("int$unique^ a=1$ambiguous^+1; $empty^");
+  auto AST = TestTU::withCode(Test.code()).build();
+  unsigned Seen = 0;
+  SelectionTree::createEach(
+      AST.getASTContext(), AST.getTokens(), Test.point("ambiguous"),
+      Test.point("ambiguous"), [&](SelectionTree T) {
+        // Expect to see the right-biased tree first.
+        if (Seen == 0)
+          EXPECT_EQ("BinaryOperator", nodeKind(T.commonAncestor()));
+        else if (Seen == 1)
+          EXPECT_EQ("IntegerLiteral", nodeKind(T.commonAncestor()));
+        ++Seen;
+        return false;
+      });
+  EXPECT_EQ(2u, Seen);
+
+  Seen = 0;
+  SelectionTree::createEach(AST.getASTContext(), AST.getTokens(),
+                            Test.point("ambiguous"), Test.point("ambiguous"),
+                            [&](SelectionTree T) {
+                              ++Seen;
+                              return true;
+                            });
+  EXPECT_EQ(1u, Seen) << "Return true --> stop iterating";
+
+  Seen = 0;
+  SelectionTree::createEach(AST.getASTContext(), AST.getTokens(),
+                            Test.point("unique"), Test.point("unique"),
+                            [&](SelectionTree T) {
+                              ++Seen;
+                              return false;
+                            });
+  EXPECT_EQ(1u, Seen) << "no ambiguity --> only one tree";
+
+  Seen = 0;
+  SelectionTree::createEach(AST.getASTContext(), AST.getTokens(),
+                            Test.point("empty"), Test.point("empty"),
+                            [&](SelectionTree T) {
+                              EXPECT_FALSE(T.commonAncestor());
+                              ++Seen;
+                              return false;
+                            });
+  EXPECT_EQ(1u, Seen) << "empty tree still created";
+
+  Seen = 0;
+  SelectionTree::createEach(AST.getASTContext(), AST.getTokens(),
+                            Test.point("unique"), Test.point("ambiguous"),
+                            [&](SelectionTree T) {
+                              ++Seen;
+                              return false;
+                            });
+  EXPECT_EQ(1u, Seen) << "one tree for nontrivial selection";
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/unittests/TweakTesting.cpp b/clang-tools-extra/clangd/unittests/TweakTesting.cpp
--- a/clang-tools-extra/clangd/unittests/TweakTesting.cpp
+++ b/clang-tools-extra/clangd/unittests/TweakTesting.cpp
@@ -63,12 +63,33 @@
           cantFail(positionToOffset(A.code(), SelectionRng.end))};
 }

+// Prepare and apply the specified tweak based on the selection in Input.
+// Returns None if and only if prepare() failed.
+llvm::Optional<llvm::Expected<Tweak::Effect>>
+applyTweak(ParsedAST &AST, const Annotations &Input, StringRef TweakID,
+           const SymbolIndex *Index) {
+  auto Range = rangeOrPoint(Input);
+  llvm::Optional<llvm::Expected<Tweak::Effect>> Result;
+  SelectionTree::createEach(AST.getASTContext(), AST.getTokens(), Range.first,
+                            Range.second, [&](SelectionTree ST) {
+                              Tweak::Selection S(Index, AST, Range.first,
+                                                 Range.second, std::move(ST));
+                              if (auto T = prepareTweak(TweakID, S)) {
+                                Result = (*T)->apply(S);
+                                return true;
+                              } else {
+                                llvm::consumeError(T.takeError());
+                                return false;
+                              }
+                            });
+  return Result;
+}
+
 MATCHER_P7(TweakIsAvailable, TweakID, Ctx, Header, ExtraArgs, ExtraFiles,
            Index, FileName,
            (TweakID + (negation ? " is unavailable" : " is available")).str()) {
   std::string WrappedCode = wrap(Ctx, arg);
   Annotations Input(WrappedCode);
-  auto Selection = rangeOrPoint(Input);
   TestTU TU;
   TU.Filename = std::string(FileName);
   TU.HeaderCode = Header;
@@ -76,12 +97,11 @@
   TU.ExtraArgs = ExtraArgs;
   TU.AdditionalFiles = std::move(ExtraFiles);
   ParsedAST AST = TU.build();
-  Tweak::Selection S(Index, AST, Selection.first, Selection.second);
-  auto PrepareResult = prepareTweak(TweakID, S);
-  if (PrepareResult)
-    return true;
-  llvm::consumeError(PrepareResult.takeError());
-  return false;
+  auto Result = applyTweak(AST, Input, TweakID, Index);
+  // We only care if prepare() succeeded, but must handle Errors.
+  if (Result && !*Result)
+    consumeError(Result->takeError());
+  return Result.hasValue();
 }

 } // namespace
@@ -90,8 +110,6 @@
                                 llvm::StringMap<std::string> *EditedFiles) const {
   std::string WrappedCode = wrap(Context, MarkedCode);
   Annotations Input(WrappedCode);
-  auto Selection = rangeOrPoint(Input);
-
   TestTU TU;
   TU.Filename = std::string(FileName);
   TU.HeaderCode = Header;
@@ -99,23 +117,20 @@
   TU.Code = std::string(Input.code());
   TU.ExtraArgs = ExtraArgs;
   ParsedAST AST = TU.build();
-  Tweak::Selection S(Index.get(), AST, Selection.first, Selection.second);
-  auto T = prepareTweak(TweakID, S);
-  if (!T) {
-    llvm::consumeError(T.takeError());
-    return "unavailable";
-  }
-  llvm::Expected<Tweak::Effect> Result = (*T)->apply(S);
+  auto Result = applyTweak(AST, Input, TweakID, Index.get());
   if (!Result)
-    return "fail: " + llvm::toString(Result.takeError());
-  if (Result->ShowMessage)
-    return "message:\n" + *Result->ShowMessage;
-  if (Result->ApplyEdits.empty())
+    return "unavailable";
+  if (!*Result)
+    return "fail: " + llvm::toString(Result->takeError());
+  const auto &Effect = **Result;
+  if ((*Result)->ShowMessage)
+    return "message:\n" + *Effect.ShowMessage;
+  if (Effect.ApplyEdits.empty())
     return "no effect";
   std::string EditedMainFile;
-  for (auto &It : Result->ApplyEdits) {
+  for (auto &It : Effect.ApplyEdits) {
     auto NewText = It.second.apply();
     if (!NewText)
       return "bad edits: " + llvm::toString(NewText.takeError());
diff --git a/clang-tools-extra/clangd/unittests/TweakTests.cpp b/clang-tools-extra/clangd/unittests/TweakTests.cpp
--- a/clang-tools-extra/clangd/unittests/TweakTests.cpp
+++ b/clang-tools-extra/clangd/unittests/TweakTests.cpp
@@ -1966,7 +1966,7 @@
   // Basic check for function body and signature.
  EXPECT_AVAILABLE(R"cpp(
    class Bar {
-      [[void [[f^o^o]]() [[{ return; }]]]]
+      [[void [[f^o^o^]]() [[{ return; }]]]]
    };
    void foo();
diff --git a/clang/include/clang/Driver/CC1Options.td b/clang/include/clang/Driver/CC1Options.td
--- a/clang/include/clang/Driver/CC1Options.td
+++ b/clang/include/clang/Driver/CC1Options.td
@@ -284,8 +284,6 @@
   HelpText<"Enable enhanced struct-path aware Type Based Alias Analysis">;
 def masm_verbose : Flag<["-"], "masm-verbose">,
   HelpText<"Generate verbose assembly output">;
-def mcode_model : Separate<["-"], "mcode-model">,
-  HelpText<"The code model to use">, Values<"tiny,small,kernel,medium,large">;
 def mdebug_pass : Separate<["-"], "mdebug-pass">,
   HelpText<"Enable additional debug output">;
 def mframe_pointer_EQ : Joined<["-"], "mframe-pointer=">,
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2166,10 +2166,13 @@
 def malign_functions_EQ : Joined<["-"], "malign-functions=">, Group<clang_ignored_m_Group>;
 def malign_loops_EQ : Joined<["-"], "malign-loops=">, Group<clang_ignored_m_Group>;
 def malign_jumps_EQ : Joined<["-"], "malign-jumps=">, Group<clang_ignored_m_Group>;
-def malign_branch_EQ : CommaJoined<["-"], "malign-branch=">, Group<m_Group>;
-def malign_branch_boundary_EQ : Joined<["-"], "malign-branch-boundary=">, Group<m_Group>;
+def malign_branch_EQ : CommaJoined<["-"], "malign-branch=">, Group<m_Group>, Flags<[DriverOption]>,
+  HelpText<"Specify types of branches to align">;
+def malign_branch_boundary_EQ : Joined<["-"], "malign-branch-boundary=">, Group<m_Group>, Flags<[DriverOption]>,
+  HelpText<"Specify the boundary's size to align branches">;
 def malign_branch_prefix_size_EQ : Joined<["-"], "malign-branch-prefix-size=">, Group<m_Group>;
-def mbranches_within_32B_boundaries : Flag<["-"], "mbranches-within-32B-boundaries">, Flags<[DriverOption]>, Group<m_Group>;
+def mbranches_within_32B_boundaries : Flag<["-"], "mbranches-within-32B-boundaries">, Flags<[DriverOption]>, Group<m_Group>,
+  HelpText<"Align selected branches (fused, jcc, jmp) within 32-byte boundary">;
 def mfancy_math_387 : Flag<["-"], "mfancy-math-387">, Group<clang_ignored_m_Group>;
 def mlong_calls : Flag<["-"], "mlong-calls">, Group<m_Group>,
   HelpText<"Generate branches with extended addressability, usually via indirect jumps.">;
@@ -2201,7 +2204,7 @@
 def mwatchsimulator_version_min_EQ : Joined<["-"], "mwatchsimulator-version-min=">, Alias<mwatchos_simulator_version_min_EQ>;
 def march_EQ : Joined<["-"], "march=">, Group<m_Group>, Flags<[CoreOption]>;
 def masm_EQ : Joined<["-"], "masm=">, Group<m_Group>, Flags<[DriverOption]>;
-def mcmodel_EQ : Joined<["-"], "mcmodel=">, Group<m_Group>;
+def mcmodel_EQ : Joined<["-"], "mcmodel=">, Group<m_Group>, Flags<[CC1Option]>;
 def mtls_size_EQ : Joined<["-"], "mtls-size=">, Group<m_Group>, Flags<[DriverOption, CC1Option]>,
   HelpText<"Specify bit size of immediate TLS offsets (AArch64 ELF only): "
            "12 (for 4KB) | 24 (for 16MB, default) | 32 (for 4GB) | 48 (for 256TB, needs -mcmodel=large)">;
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -1113,7 +1113,8 @@
   /// it (unless StopBeforeMatch is specified). Because we cannot guarantee
   /// that the token will ever occur, this skips to the next token, or to some
   /// likely good stopping point. If Flags has StopAtSemi flag, skipping will
-  /// stop at a ';' character.
+  /// stop at a ';' character. Balances (), [], and {} delimiter tokens while
+  /// skipping.
   ///
   /// If SkipUntil finds the specified token, it returns true, otherwise it
   /// returns false.
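The sentence added to the SkipUntil comment is worth a concrete illustration (malformed input, as seen during error recovery; not from the patch):

  // SkipUntil(tok::r_paren, StopAtSemi) while recovering inside:
  //   foo(a, { b; c }, d); e;
  //        ^ skipping starts here
  // The '{ b; c }' region is consumed as one balanced unit, so the ';' inside
  // it does not stop skipping; the skip ends at the ')' closing foo's argument
  // list. StopAtSemi only applies to a ';' at the current nesting depth.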
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -194,6 +194,13 @@ Builder.defineMacro("__LP64__"); } + std::string CodeModel = getTargetOpts().CodeModel; + if (CodeModel == "default") + CodeModel = "small"; + for (char &c : CodeModel) + c = toupper(c); + Builder.defineMacro("__AARCH64_CMODEL_" + CodeModel + "__"); + // ACLE predefines. Many can only have one possible value on v8 AArch64. Builder.defineMacro("__ARM_ACLE", "200"); Builder.defineMacro("__ARM_ARCH", "8"); diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -911,7 +911,7 @@ std::string CodeModel = getTargetOpts().CodeModel; if (CodeModel == "default") CodeModel = "small"; - Builder.defineMacro("__code_model_" + CodeModel + "_"); + Builder.defineMacro("__code_model_" + CodeModel + "__"); // Target identification. if (getTriple().getArch() == llvm::Triple::x86_64) { diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2007,21 +2007,19 @@ options::OPT_mno_backchain, false); bool HasPackedStack = Args.hasFlag(options::OPT_mpacked_stack, options::OPT_mno_packed_stack, false); - if (HasBackchain && HasPackedStack) { + systemz::FloatABI FloatABI = + systemz::getSystemZFloatABI(getToolChain().getDriver(), Args); + bool HasSoftFloat = (FloatABI == systemz::FloatABI::Soft); + if (HasBackchain && HasPackedStack && !HasSoftFloat) { const Driver &D = getToolChain().getDriver(); D.Diag(diag::err_drv_unsupported_opt) - << Args.getLastArg(options::OPT_mpacked_stack)->getAsString(Args) + - " " + Args.getLastArg(options::OPT_mbackchain)->getAsString(Args); + << "-mpacked-stack -mbackchain -mhard-float"; } if (HasBackchain) CmdArgs.push_back("-mbackchain"); if (HasPackedStack) CmdArgs.push_back("-mpacked-stack"); - - systemz::FloatABI FloatABI = - systemz::getSystemZFloatABI(getToolChain().getDriver(), Args); - - if (FloatABI == systemz::FloatABI::Soft) { + if (HasSoftFloat) { // Floating point operations and argument passing are soft. 
CmdArgs.push_back("-msoft-float"); CmdArgs.push_back("-mfloat-abi"); @@ -4672,8 +4670,13 @@ (void)Args.hasArg(options::OPT_mtune_EQ); if (Arg *A = Args.getLastArg(options::OPT_mcmodel_EQ)) { - CmdArgs.push_back("-mcode-model"); - CmdArgs.push_back(A->getValue()); + StringRef CM = A->getValue(); + if (CM == "small" || CM == "kernel" || CM == "medium" || CM == "large" || + CM == "tiny") + A->render(Args, CmdArgs); + else + D.Diag(diag::err_drv_invalid_argument_to_option) + << CM << A->getOption().getName(); } if (Arg *A = Args.getLastArg(options::OPT_mtls_size_EQ)) { @@ -5882,7 +5885,7 @@ Arg->render(Args, OriginalArgs); SmallString<256> Flags; - Flags += Exec; + EscapeSpacesAndBackslashes(Exec, Flags); for (const char *OriginalArg : OriginalArgs) { SmallString<128> EscapedArg; EscapeSpacesAndBackslashes(OriginalArg, EscapedArg); @@ -6790,7 +6793,7 @@ SmallString<256> Flags; const char *Exec = getToolChain().getDriver().getClangProgramPath(); - Flags += Exec; + EscapeSpacesAndBackslashes(Exec, Flags); for (const char *OriginalArg : OriginalArgs) { SmallString<128> EscapedArg; EscapeSpacesAndBackslashes(OriginalArg, EscapedArg); diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1144,8 +1144,10 @@ if (hasExportSymbolDirective(Args)) { if (ForGCOV) { addExportedSymbol(CmdArgs, "___gcov_flush"); + addExportedSymbol(CmdArgs, "___gcov_fork"); addExportedSymbol(CmdArgs, "_flush_fn_list"); addExportedSymbol(CmdArgs, "_writeout_fn_list"); + addExportedSymbol(CmdArgs, "_reset_fn_list"); } else { addExportedSymbol(CmdArgs, "___llvm_profile_filename"); addExportedSymbol(CmdArgs, "___llvm_profile_raw_version"); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -527,17 +527,6 @@ Opts.ParseAllComments = Args.hasArg(OPT_fparse_all_comments); } -static StringRef getCodeModel(ArgList &Args, DiagnosticsEngine &Diags) { - if (Arg *A = Args.getLastArg(OPT_mcode_model)) { - StringRef Value = A->getValue(); - if (Value == "small" || Value == "kernel" || Value == "medium" || - Value == "large" || Value == "tiny") - return Value; - Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Value; - } - return "default"; -} - static llvm::Reloc::Model getRelocModel(ArgList &Args, DiagnosticsEngine &Diags) { if (Arg *A = Args.getLastArg(OPT_mrelocation_model)) { @@ -3496,7 +3485,7 @@ static void ParseTargetArgs(TargetOptions &Opts, ArgList &Args, DiagnosticsEngine &Diags) { - Opts.CodeModel = std::string(getCodeModel(Args, Diags)); + Opts.CodeModel = std::string(Args.getLastArgValue(OPT_mcmodel_EQ, "default")); Opts.ABI = std::string(Args.getLastArgValue(OPT_target_abi)); if (Arg *A = Args.getLastArg(OPT_meabi)) { StringRef Value = A->getValue(); diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -2764,7 +2764,8 @@ } if ((Self.Context.getTypeSize(SrcType) > - Self.Context.getTypeSize(DestType))) { + Self.Context.getTypeSize(DestType)) && + !DestType->isBooleanType()) { // C 6.3.2.3p6: Any pointer type may be converted to an integer type. // Except as previously specified, the result is implementation-defined. 
// If the result cannot be represented in the integer type, the behavior diff --git a/clang/test/CodeGen/codemodels.c b/clang/test/CodeGen/codemodels.c --- a/clang/test/CodeGen/codemodels.c +++ b/clang/test/CodeGen/codemodels.c @@ -1,9 +1,9 @@ // RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NOMODEL -// RUN: %clang_cc1 -triple aarch64-unknown-none-eabi -emit-llvm -mcode-model tiny %s -o - | FileCheck %s -check-prefix=CHECK-TINY -// RUN: %clang_cc1 -emit-llvm -mcode-model small %s -o - | FileCheck %s -check-prefix=CHECK-SMALL -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -mcode-model kernel %s -o - | FileCheck %s -check-prefix=CHECK-KERNEL -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -mcode-model medium %s -o - | FileCheck %s -check-prefix=CHECK-MEDIUM -// RUN: %clang_cc1 -emit-llvm -mcode-model large %s -o - | FileCheck %s -check-prefix=CHECK-LARGE +// RUN: %clang_cc1 -triple aarch64-unknown-none-eabi -emit-llvm -mcmodel=tiny %s -o - | FileCheck %s -check-prefix=CHECK-TINY +// RUN: %clang_cc1 -emit-llvm -mcmodel=small %s -o - | FileCheck %s -check-prefix=CHECK-SMALL +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -mcmodel=kernel %s -o - | FileCheck %s -check-prefix=CHECK-KERNEL +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -mcmodel=medium %s -o - | FileCheck %s -check-prefix=CHECK-MEDIUM +// RUN: %clang_cc1 -emit-llvm -mcmodel=large %s -o - | FileCheck %s -check-prefix=CHECK-LARGE // CHECK-TINY: !llvm.module.flags = !{{{.*}}} // CHECK-TINY: !{{[0-9]+}} = !{i32 1, !"Code Model", i32 0} diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -572,6 +572,11 @@ // CHECK-RECORD-GCC-SWITCHES: "-record-command-line" // CHECK-NO-RECORD-GCC-SWITCHES-NOT: "-record-command-line" // CHECK-RECORD-GCC-SWITCHES-ERROR: error: unsupported option '-frecord-command-line' for target +// Test when clang is in a path containing a space. 
+// RUN: mkdir -p "%t.r/with spaces" +// RUN: cp %clang "%t.r/with spaces/clang" +// RUN: "%t.r/with spaces/clang" -### -S -target x86_64-unknown-linux -frecord-gcc-switches %s 2>&1 | FileCheck -check-prefix=CHECK-RECORD-GCC-SWITCHES-ESCAPED %s +// CHECK-RECORD-GCC-SWITCHES-ESCAPED: "-record-command-line" "{{.+}}with\\ spaces{{.+}}" // RUN: %clang -### -S -ftrivial-auto-var-init=uninitialized %s 2>&1 | FileCheck -check-prefix=CHECK-TRIVIAL-UNINIT %s // RUN: %clang -### -S -ftrivial-auto-var-init=pattern %s 2>&1 | FileCheck -check-prefix=CHECK-TRIVIAL-PATTERN %s diff --git a/clang/test/Driver/code-model.c b/clang/test/Driver/code-model.c deleted file mode 100644 --- a/clang/test/Driver/code-model.c +++ /dev/null @@ -1,15 +0,0 @@ -// RUN: %clang -### -c -mcmodel=tiny %s 2>&1 | FileCheck -check-prefix CHECK-TINY %s -// RUN: %clang -### -c -mcmodel=small %s 2>&1 | FileCheck -check-prefix CHECK-SMALL %s -// RUN: %clang -### -S -mcmodel=kernel %s 2>&1 | FileCheck -check-prefix CHECK-KERNEL %s -// RUN: %clang -### -c -mcmodel=medium %s 2>&1 | FileCheck -check-prefix CHECK-MEDIUM %s -// RUN: %clang -### -S -mcmodel=large %s 2>&1 | FileCheck -check-prefix CHECK-LARGE %s -// RUN: not %clang -c -mcmodel=lager %s 2>&1 | FileCheck -check-prefix CHECK-INVALID %s - -// CHECK-TINY: "-mcode-model" "tiny" -// CHECK-SMALL: "-mcode-model" "small" -// CHECK-KERNEL: "-mcode-model" "kernel" -// CHECK-MEDIUM: "-mcode-model" "medium" -// CHECK-LARGE: "-mcode-model" "large" - -// CHECK-INVALID: error: invalid value 'lager' in '-mcode-model lager' - diff --git a/clang/test/Driver/mbackchain.c b/clang/test/Driver/mbackchain.c --- a/clang/test/Driver/mbackchain.c +++ b/clang/test/Driver/mbackchain.c @@ -1,3 +1,7 @@ // RUN: %clang -target s390x -c -### %s -mpacked-stack -mbackchain 2>&1 | FileCheck %s +// RUN: %clang -target s390x -c -### %s -mpacked-stack -mbackchain -msoft-float \ +// RUN: 2>&1 | FileCheck %s --check-prefix=KERNEL-BUILD +// REQUIRES: systemz-registered-target -// CHECK: error: unsupported option '-mpacked-stack -mbackchain' +// CHECK: error: unsupported option '-mpacked-stack -mbackchain -mhard-float' +// KERNEL-BUILD-NOT: error: unsupported option diff --git a/clang/test/Driver/mcmodel.c b/clang/test/Driver/mcmodel.c new file mode 100644 --- /dev/null +++ b/clang/test/Driver/mcmodel.c @@ -0,0 +1,14 @@ +// RUN: %clang -target x86_64 -### -c -mcmodel=tiny %s 2>&1 | FileCheck --check-prefix=TINY %s +// RUN: %clang -target x86_64 -### -c -mcmodel=small %s 2>&1 | FileCheck --check-prefix=SMALL %s +// RUN: %clang -target x86_64 -### -S -mcmodel=kernel %s 2>&1 | FileCheck --check-prefix=KERNEL %s +// RUN: %clang -target x86_64 -### -c -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=MEDIUM %s +// RUN: %clang -target x86_64 -### -S -mcmodel=large %s 2>&1 | FileCheck --check-prefix=LARGE %s +// RUN: not %clang -c -mcmodel=lager %s 2>&1 | FileCheck --check-prefix=INVALID %s + +// TINY: "-mcmodel=tiny" +// SMALL: "-mcmodel=small" +// KERNEL: "-mcmodel=kernel" +// MEDIUM: "-mcmodel=medium" +// LARGE: "-mcmodel=large" + +// INVALID: error: invalid argument 'lager' to -mcmodel= diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -40,9 +40,6 @@ // CHECK-NOT: __ARM_FEATURE_SVE // CHECK-NOT: __ARM_FEATURE_DOTPROD -// RUN: %clang -target aarch64_be-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-BIGENDIAN -// CHECK-BIGENDIAN: 
__ARM_BIG_ENDIAN 1 - // RUN: %clang -target aarch64-none-linux-gnu -march=armv8-a+crypto -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-CRYPTO %s // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a+crypto -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-CRYPTO %s // CHECK-CRYPTO: __ARM_FEATURE_CRYPTO 1 diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c new file mode 100644 --- /dev/null +++ b/clang/test/Preprocessor/init-aarch64.c @@ -0,0 +1,701 @@ +// RUN: %clang_cc1 -E -dM -triple=aarch64 -xc /dev/null > %t.aarch64 +// RUN: FileCheck --check-prefixes=AARCH64,AARCH64_LE,AARCH64_C %s --match-full-lines < %t.aarch64 +// RUN: %clang_cc1 -E -dM -triple=arm64 -xc /dev/null > %t.arm64 +// RUN: cmp %t.aarch64 %t.arm64 +// RUN: %clang_cc1 -E -dM -triple=aarch64_be -xc /dev/null | FileCheck --check-prefixes=AARCH64,AARCH64_BE,AARCH64_C --match-full-lines %s +// RUN: %clang_cc1 -E -dM -triple=arm64 -xc++ /dev/null | FileCheck --check-prefixes=AARCH64,AARCH64_LE,AARCH64_CXX --match-full-lines %s + +// AARCH64: #define _LP64 1 +// AARCH64_BE-NEXT: #define __AARCH64EB__ 1 +// AARCH64_BE-NEXT: #define __AARCH64_CMODEL_SMALL__ 1 +// AARCH64_BE-NEXT: #define __AARCH_BIG_ENDIAN 1 +// AARCH64_LE-NEXT: #define __AARCH64EL__ 1 +// AARCH64_LE-NEXT: #define __AARCH64_CMODEL_SMALL__ 1 +// AARCH64-NEXT: #define __ARM_64BIT_STATE 1 +// AARCH64-NEXT: #define __ARM_ACLE 200 +// AARCH64-NEXT: #define __ARM_ALIGN_MAX_STACK_PWR 4 +// AARCH64-NEXT: #define __ARM_ARCH 8 +// AARCH64-NEXT: #define __ARM_ARCH_ISA_A64 1 +// AARCH64-NEXT: #define __ARM_ARCH_PROFILE 'A' +// AARCH64_BE-NEXT: #define __ARM_BIG_ENDIAN 1 +// AARCH64-NEXT: #define __ARM_FEATURE_CLZ 1 +// AARCH64-NEXT: #define __ARM_FEATURE_DIRECTED_ROUNDING 1 +// AARCH64-NEXT: #define __ARM_FEATURE_DIV 1 +// AARCH64-NEXT: #define __ARM_FEATURE_FMA 1 +// AARCH64-NEXT: #define __ARM_FEATURE_IDIV 1 +// AARCH64-NEXT: #define __ARM_FEATURE_LDREX 0xF +// AARCH64-NEXT: #define __ARM_FEATURE_NUMERIC_MAXMIN 1 +// AARCH64-NEXT: #define __ARM_FEATURE_UNALIGNED 1 +// AARCH64-NEXT: #define __ARM_FP 0xE +// AARCH64-NEXT: #define __ARM_FP16_ARGS 1 +// AARCH64-NEXT: #define __ARM_FP16_FORMAT_IEEE 1 +// AARCH64-NEXT: #define __ARM_PCS_AAPCS64 1 +// AARCH64-NEXT: #define __ARM_SIZEOF_MINIMAL_ENUM 4 +// AARCH64-NEXT: #define __ARM_SIZEOF_WCHAR_T 4 +// AARCH64-NEXT: #define __ATOMIC_ACQUIRE 2 +// AARCH64-NEXT: #define __ATOMIC_ACQ_REL 4 +// AARCH64-NEXT: #define __ATOMIC_CONSUME 1 +// AARCH64-NEXT: #define __ATOMIC_RELAXED 0 +// AARCH64-NEXT: #define __ATOMIC_RELEASE 3 +// AARCH64-NEXT: #define __ATOMIC_SEQ_CST 5 +// AARCH64: #define __BIGGEST_ALIGNMENT__ 16 +// AARCH64_BE-NEXT: #define __BIG_ENDIAN__ 1 +// AARCH64_BE-NEXT: #define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ +// AARCH64_LE-NEXT: #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +// AARCH64-NEXT: #define __CHAR16_TYPE__ unsigned short +// AARCH64-NEXT: #define __CHAR32_TYPE__ unsigned int +// AARCH64-NEXT: #define __CHAR_BIT__ 8 +// AARCH64-NEXT: #define __CLANG_ATOMIC_BOOL_LOCK_FREE 2 +// AARCH64-NEXT: #define __CLANG_ATOMIC_CHAR16_T_LOCK_FREE 2 +// AARCH64-NEXT: #define __CLANG_ATOMIC_CHAR32_T_LOCK_FREE 2 +// AARCH64-NEXT: #define __CLANG_ATOMIC_CHAR_LOCK_FREE 2 +// AARCH64-NEXT: #define __CLANG_ATOMIC_INT_LOCK_FREE 2 +// AARCH64-NEXT: #define __CLANG_ATOMIC_LLONG_LOCK_FREE 2 +// AARCH64-NEXT: #define __CLANG_ATOMIC_LONG_LOCK_FREE 2 +// AARCH64-NEXT: #define __CLANG_ATOMIC_POINTER_LOCK_FREE 2 +// AARCH64-NEXT: #define __CLANG_ATOMIC_SHORT_LOCK_FREE 2 +// AARCH64-NEXT: 
#define __CLANG_ATOMIC_WCHAR_T_LOCK_FREE 2 +// AARCH64-NEXT: #define __CONSTANT_CFSTRINGS__ 1 +// AARCH64-NEXT: #define __DBL_DECIMAL_DIG__ 17 +// AARCH64-NEXT: #define __DBL_DENORM_MIN__ 4.9406564584124654e-324 +// AARCH64-NEXT: #define __DBL_DIG__ 15 +// AARCH64-NEXT: #define __DBL_EPSILON__ 2.2204460492503131e-16 +// AARCH64-NEXT: #define __DBL_HAS_DENORM__ 1 +// AARCH64-NEXT: #define __DBL_HAS_INFINITY__ 1 +// AARCH64-NEXT: #define __DBL_HAS_QUIET_NAN__ 1 +// AARCH64-NEXT: #define __DBL_MANT_DIG__ 53 +// AARCH64-NEXT: #define __DBL_MAX_10_EXP__ 308 +// AARCH64-NEXT: #define __DBL_MAX_EXP__ 1024 +// AARCH64-NEXT: #define __DBL_MAX__ 1.7976931348623157e+308 +// AARCH64-NEXT: #define __DBL_MIN_10_EXP__ (-307) +// AARCH64-NEXT: #define __DBL_MIN_EXP__ (-1021) +// AARCH64-NEXT: #define __DBL_MIN__ 2.2250738585072014e-308 +// AARCH64-NEXT: #define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__ +// AARCH64-NEXT: #define __ELF__ 1 +// AARCH64-NEXT: #define __FINITE_MATH_ONLY__ 0 +// AARCH64-NEXT: #define __FLT16_DECIMAL_DIG__ 5 +// AARCH64-NEXT: #define __FLT16_DENORM_MIN__ 5.9604644775390625e-8F16 +// AARCH64-NEXT: #define __FLT16_DIG__ 3 +// AARCH64-NEXT: #define __FLT16_EPSILON__ 9.765625e-4F16 +// AARCH64-NEXT: #define __FLT16_HAS_DENORM__ 1 +// AARCH64-NEXT: #define __FLT16_HAS_INFINITY__ 1 +// AARCH64-NEXT: #define __FLT16_HAS_QUIET_NAN__ 1 +// AARCH64-NEXT: #define __FLT16_MANT_DIG__ 11 +// AARCH64-NEXT: #define __FLT16_MAX_10_EXP__ 4 +// AARCH64-NEXT: #define __FLT16_MAX_EXP__ 16 +// AARCH64-NEXT: #define __FLT16_MAX__ 6.5504e+4F16 +// AARCH64-NEXT: #define __FLT16_MIN_10_EXP__ (-4) +// AARCH64-NEXT: #define __FLT16_MIN_EXP__ (-13) +// AARCH64-NEXT: #define __FLT16_MIN__ 6.103515625e-5F16 +// AARCH64-NEXT: #define __FLT_DECIMAL_DIG__ 9 +// AARCH64-NEXT: #define __FLT_DENORM_MIN__ 1.40129846e-45F +// AARCH64-NEXT: #define __FLT_DIG__ 6 +// AARCH64-NEXT: #define __FLT_EPSILON__ 1.19209290e-7F +// AARCH64-NEXT: #define __FLT_EVAL_METHOD__ 0 +// AARCH64-NEXT: #define __FLT_HAS_DENORM__ 1 +// AARCH64-NEXT: #define __FLT_HAS_INFINITY__ 1 +// AARCH64-NEXT: #define __FLT_HAS_QUIET_NAN__ 1 +// AARCH64-NEXT: #define __FLT_MANT_DIG__ 24 +// AARCH64-NEXT: #define __FLT_MAX_10_EXP__ 38 +// AARCH64-NEXT: #define __FLT_MAX_EXP__ 128 +// AARCH64-NEXT: #define __FLT_MAX__ 3.40282347e+38F +// AARCH64-NEXT: #define __FLT_MIN_10_EXP__ (-37) +// AARCH64-NEXT: #define __FLT_MIN_EXP__ (-125) +// AARCH64-NEXT: #define __FLT_MIN__ 1.17549435e-38F +// AARCH64-NEXT: #define __FLT_RADIX__ 2 +// AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1 +// AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 +// AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 +// AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1 +// AARCH64_CXX-NEXT: #define __GLIBCXX_BITSIZE_INT_N_0 128 +// AARCH64_CXX-NEXT: #define __GLIBCXX_TYPE_INT_N_0 __int128 +// AARCH64-NEXT: #define __INT16_C_SUFFIX__ +// AARCH64-NEXT: #define __INT16_FMTd__ "hd" +// AARCH64-NEXT: #define __INT16_FMTi__ "hi" +// AARCH64-NEXT: #define __INT16_MAX__ 32767 +// AARCH64-NEXT: #define __INT16_TYPE__ short +// AARCH64-NEXT: #define __INT32_C_SUFFIX__ +// AARCH64-NEXT: #define __INT32_FMTd__ "d" +// AARCH64-NEXT: #define __INT32_FMTi__ "i" +// AARCH64-NEXT: #define __INT32_MAX__ 2147483647 +// AARCH64-NEXT: #define __INT32_TYPE__ int +// AARCH64-NEXT: #define __INT64_C_SUFFIX__ L +// AARCH64-NEXT: #define __INT64_FMTd__ "ld" +// AARCH64-NEXT: #define __INT64_FMTi__ "li" +// AARCH64-NEXT: #define __INT64_MAX__ 9223372036854775807L +// AARCH64-NEXT: 
#define __INT64_TYPE__ long int +// AARCH64-NEXT: #define __INT8_C_SUFFIX__ +// AARCH64-NEXT: #define __INT8_FMTd__ "hhd" +// AARCH64-NEXT: #define __INT8_FMTi__ "hhi" +// AARCH64-NEXT: #define __INT8_MAX__ 127 +// AARCH64-NEXT: #define __INT8_TYPE__ signed char +// AARCH64-NEXT: #define __INTMAX_C_SUFFIX__ L +// AARCH64-NEXT: #define __INTMAX_FMTd__ "ld" +// AARCH64-NEXT: #define __INTMAX_FMTi__ "li" +// AARCH64-NEXT: #define __INTMAX_MAX__ 9223372036854775807L +// AARCH64-NEXT: #define __INTMAX_TYPE__ long int +// AARCH64-NEXT: #define __INTMAX_WIDTH__ 64 +// AARCH64-NEXT: #define __INTPTR_FMTd__ "ld" +// AARCH64-NEXT: #define __INTPTR_FMTi__ "li" +// AARCH64-NEXT: #define __INTPTR_MAX__ 9223372036854775807L +// AARCH64-NEXT: #define __INTPTR_TYPE__ long int +// AARCH64-NEXT: #define __INTPTR_WIDTH__ 64 +// AARCH64-NEXT: #define __INT_FAST16_FMTd__ "hd" +// AARCH64-NEXT: #define __INT_FAST16_FMTi__ "hi" +// AARCH64-NEXT: #define __INT_FAST16_MAX__ 32767 +// AARCH64-NEXT: #define __INT_FAST16_TYPE__ short +// AARCH64-NEXT: #define __INT_FAST32_FMTd__ "d" +// AARCH64-NEXT: #define __INT_FAST32_FMTi__ "i" +// AARCH64-NEXT: #define __INT_FAST32_MAX__ 2147483647 +// AARCH64-NEXT: #define __INT_FAST32_TYPE__ int +// AARCH64-NEXT: #define __INT_FAST64_FMTd__ "ld" +// AARCH64-NEXT: #define __INT_FAST64_FMTi__ "li" +// AARCH64-NEXT: #define __INT_FAST64_MAX__ 9223372036854775807L +// AARCH64-NEXT: #define __INT_FAST64_TYPE__ long int +// AARCH64-NEXT: #define __INT_FAST8_FMTd__ "hhd" +// AARCH64-NEXT: #define __INT_FAST8_FMTi__ "hhi" +// AARCH64-NEXT: #define __INT_FAST8_MAX__ 127 +// AARCH64-NEXT: #define __INT_FAST8_TYPE__ signed char +// AARCH64-NEXT: #define __INT_LEAST16_FMTd__ "hd" +// AARCH64-NEXT: #define __INT_LEAST16_FMTi__ "hi" +// AARCH64-NEXT: #define __INT_LEAST16_MAX__ 32767 +// AARCH64-NEXT: #define __INT_LEAST16_TYPE__ short +// AARCH64-NEXT: #define __INT_LEAST32_FMTd__ "d" +// AARCH64-NEXT: #define __INT_LEAST32_FMTi__ "i" +// AARCH64-NEXT: #define __INT_LEAST32_MAX__ 2147483647 +// AARCH64-NEXT: #define __INT_LEAST32_TYPE__ int +// AARCH64-NEXT: #define __INT_LEAST64_FMTd__ "ld" +// AARCH64-NEXT: #define __INT_LEAST64_FMTi__ "li" +// AARCH64-NEXT: #define __INT_LEAST64_MAX__ 9223372036854775807L +// AARCH64-NEXT: #define __INT_LEAST64_TYPE__ long int +// AARCH64-NEXT: #define __INT_LEAST8_FMTd__ "hhd" +// AARCH64-NEXT: #define __INT_LEAST8_FMTi__ "hhi" +// AARCH64-NEXT: #define __INT_LEAST8_MAX__ 127 +// AARCH64-NEXT: #define __INT_LEAST8_TYPE__ signed char +// AARCH64-NEXT: #define __INT_MAX__ 2147483647 +// AARCH64-NEXT: #define __LDBL_DECIMAL_DIG__ 36 +// AARCH64-NEXT: #define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L +// AARCH64-NEXT: #define __LDBL_DIG__ 33 +// AARCH64-NEXT: #define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L +// AARCH64-NEXT: #define __LDBL_HAS_DENORM__ 1 +// AARCH64-NEXT: #define __LDBL_HAS_INFINITY__ 1 +// AARCH64-NEXT: #define __LDBL_HAS_QUIET_NAN__ 1 +// AARCH64-NEXT: #define __LDBL_MANT_DIG__ 113 +// AARCH64-NEXT: #define __LDBL_MAX_10_EXP__ 4932 +// AARCH64-NEXT: #define __LDBL_MAX_EXP__ 16384 +// AARCH64-NEXT: #define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L +// AARCH64-NEXT: #define __LDBL_MIN_10_EXP__ (-4931) +// AARCH64-NEXT: #define __LDBL_MIN_EXP__ (-16381) +// AARCH64-NEXT: #define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L +// AARCH64_LE-NEXT: #define __LITTLE_ENDIAN__ 1 +// AARCH64-NEXT: #define __LONG_LONG_MAX__ 9223372036854775807LL +// AARCH64-NEXT: #define 
__LONG_MAX__ 9223372036854775807L
+// AARCH64-NEXT: #define __LP64__ 1
+// AARCH64-NEXT: #define __NO_INLINE__ 1
+// AARCH64-NEXT: #define __OBJC_BOOL_IS_BOOL 0
+// AARCH64-NEXT: #define __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES 3
+// AARCH64-NEXT: #define __OPENCL_MEMORY_SCOPE_DEVICE 2
+// AARCH64-NEXT: #define __OPENCL_MEMORY_SCOPE_SUB_GROUP 4
+// AARCH64-NEXT: #define __OPENCL_MEMORY_SCOPE_WORK_GROUP 1
+// AARCH64-NEXT: #define __OPENCL_MEMORY_SCOPE_WORK_ITEM 0
+// AARCH64-NEXT: #define __ORDER_BIG_ENDIAN__ 4321
+// AARCH64-NEXT: #define __ORDER_LITTLE_ENDIAN__ 1234
+// AARCH64-NEXT: #define __ORDER_PDP_ENDIAN__ 3412
+// AARCH64-NEXT: #define __POINTER_WIDTH__ 64
+// AARCH64-NEXT: #define __PRAGMA_REDEFINE_EXTNAME 1
+// AARCH64-NEXT: #define __PTRDIFF_FMTd__ "ld"
+// AARCH64-NEXT: #define __PTRDIFF_FMTi__ "li"
+// AARCH64-NEXT: #define __PTRDIFF_MAX__ 9223372036854775807L
+// AARCH64-NEXT: #define __PTRDIFF_TYPE__ long int
+// AARCH64-NEXT: #define __PTRDIFF_WIDTH__ 64
+// AARCH64-NEXT: #define __SCHAR_MAX__ 127
+// AARCH64-NEXT: #define __SHRT_MAX__ 32767
+// AARCH64-NEXT: #define __SIG_ATOMIC_MAX__ 2147483647
+// AARCH64-NEXT: #define __SIG_ATOMIC_WIDTH__ 32
+// AARCH64-NEXT: #define __SIZEOF_DOUBLE__ 8
+// AARCH64-NEXT: #define __SIZEOF_FLOAT__ 4
+// AARCH64-NEXT: #define __SIZEOF_INT128__ 16
+// AARCH64-NEXT: #define __SIZEOF_INT__ 4
+// AARCH64-NEXT: #define __SIZEOF_LONG_DOUBLE__ 16
+// AARCH64-NEXT: #define __SIZEOF_LONG_LONG__ 8
+// AARCH64-NEXT: #define __SIZEOF_LONG__ 8
+// AARCH64-NEXT: #define __SIZEOF_POINTER__ 8
+// AARCH64-NEXT: #define __SIZEOF_PTRDIFF_T__ 8
+// AARCH64-NEXT: #define __SIZEOF_SHORT__ 2
+// AARCH64-NEXT: #define __SIZEOF_SIZE_T__ 8
+// AARCH64-NEXT: #define __SIZEOF_WCHAR_T__ 4
+// AARCH64-NEXT: #define __SIZEOF_WINT_T__ 4
+// AARCH64-NEXT: #define __SIZE_FMTX__ "lX"
+// AARCH64-NEXT: #define __SIZE_FMTo__ "lo"
+// AARCH64-NEXT: #define __SIZE_FMTu__ "lu"
+// AARCH64-NEXT: #define __SIZE_FMTx__ "lx"
+// AARCH64-NEXT: #define __SIZE_MAX__ 18446744073709551615UL
+// AARCH64-NEXT: #define __SIZE_TYPE__ long unsigned int
+// AARCH64-NEXT: #define __SIZE_WIDTH__ 64
+// AARCH64_CXX: #define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
+// AARCH64-NEXT: #define __STDC_HOSTED__ 1
+// AARCH64-NEXT: #define __STDC_UTF_16__ 1
+// AARCH64-NEXT: #define __STDC_UTF_32__ 1
+// AARCH64_C: #define __STDC_VERSION__ 201112L
+// AARCH64-NEXT: #define __STDC__ 1
+// AARCH64-NEXT: #define __UINT16_C_SUFFIX__
+// AARCH64-NEXT: #define __UINT16_FMTX__ "hX"
+// AARCH64-NEXT: #define __UINT16_FMTo__ "ho"
+// AARCH64-NEXT: #define __UINT16_FMTu__ "hu"
+// AARCH64-NEXT: #define __UINT16_FMTx__ "hx"
+// AARCH64-NEXT: #define __UINT16_MAX__ 65535
+// AARCH64-NEXT: #define __UINT16_TYPE__ unsigned short
+// AARCH64-NEXT: #define __UINT32_C_SUFFIX__ U
+// AARCH64-NEXT: #define __UINT32_FMTX__ "X"
+// AARCH64-NEXT: #define __UINT32_FMTo__ "o"
+// AARCH64-NEXT: #define __UINT32_FMTu__ "u"
+// AARCH64-NEXT: #define __UINT32_FMTx__ "x"
+// AARCH64-NEXT: #define __UINT32_MAX__ 4294967295U
+// AARCH64-NEXT: #define __UINT32_TYPE__ unsigned int
+// AARCH64-NEXT: #define __UINT64_C_SUFFIX__ UL
+// AARCH64-NEXT: #define __UINT64_FMTX__ "lX"
+// AARCH64-NEXT: #define __UINT64_FMTo__ "lo"
+// AARCH64-NEXT: #define __UINT64_FMTu__ "lu"
+// AARCH64-NEXT: #define __UINT64_FMTx__ "lx"
+// AARCH64-NEXT: #define __UINT64_MAX__ 18446744073709551615UL
+// AARCH64-NEXT: #define __UINT64_TYPE__ long unsigned int
+// AARCH64-NEXT: #define __UINT8_C_SUFFIX__
+// AARCH64-NEXT: #define __UINT8_FMTX__ "hhX"
+// AARCH64-NEXT: #define __UINT8_FMTo__ "hho"
+// AARCH64-NEXT: #define __UINT8_FMTu__ "hhu"
+// AARCH64-NEXT: #define __UINT8_FMTx__ "hhx"
+// AARCH64-NEXT: #define __UINT8_MAX__ 255
+// AARCH64-NEXT: #define __UINT8_TYPE__ unsigned char
+// AARCH64-NEXT: #define __UINTMAX_C_SUFFIX__ UL
+// AARCH64-NEXT: #define __UINTMAX_FMTX__ "lX"
+// AARCH64-NEXT: #define __UINTMAX_FMTo__ "lo"
+// AARCH64-NEXT: #define __UINTMAX_FMTu__ "lu"
+// AARCH64-NEXT: #define __UINTMAX_FMTx__ "lx"
+// AARCH64-NEXT: #define __UINTMAX_MAX__ 18446744073709551615UL
+// AARCH64-NEXT: #define __UINTMAX_TYPE__ long unsigned int
+// AARCH64-NEXT: #define __UINTMAX_WIDTH__ 64
+// AARCH64-NEXT: #define __UINTPTR_FMTX__ "lX"
+// AARCH64-NEXT: #define __UINTPTR_FMTo__ "lo"
+// AARCH64-NEXT: #define __UINTPTR_FMTu__ "lu"
+// AARCH64-NEXT: #define __UINTPTR_FMTx__ "lx"
+// AARCH64-NEXT: #define __UINTPTR_MAX__ 18446744073709551615UL
+// AARCH64-NEXT: #define __UINTPTR_TYPE__ long unsigned int
+// AARCH64-NEXT: #define __UINTPTR_WIDTH__ 64
+// AARCH64-NEXT: #define __UINT_FAST16_FMTX__ "hX"
+// AARCH64-NEXT: #define __UINT_FAST16_FMTo__ "ho"
+// AARCH64-NEXT: #define __UINT_FAST16_FMTu__ "hu"
+// AARCH64-NEXT: #define __UINT_FAST16_FMTx__ "hx"
+// AARCH64-NEXT: #define __UINT_FAST16_MAX__ 65535
+// AARCH64-NEXT: #define __UINT_FAST16_TYPE__ unsigned short
+// AARCH64-NEXT: #define __UINT_FAST32_FMTX__ "X"
+// AARCH64-NEXT: #define __UINT_FAST32_FMTo__ "o"
+// AARCH64-NEXT: #define __UINT_FAST32_FMTu__ "u"
+// AARCH64-NEXT: #define __UINT_FAST32_FMTx__ "x"
+// AARCH64-NEXT: #define __UINT_FAST32_MAX__ 4294967295U
+// AARCH64-NEXT: #define __UINT_FAST32_TYPE__ unsigned int
+// AARCH64-NEXT: #define __UINT_FAST64_FMTX__ "lX"
+// AARCH64-NEXT: #define __UINT_FAST64_FMTo__ "lo"
+// AARCH64-NEXT: #define __UINT_FAST64_FMTu__ "lu"
+// AARCH64-NEXT: #define __UINT_FAST64_FMTx__ "lx"
+// AARCH64-NEXT: #define __UINT_FAST64_MAX__ 18446744073709551615UL
+// AARCH64-NEXT: #define __UINT_FAST64_TYPE__ long unsigned int
+// AARCH64-NEXT: #define __UINT_FAST8_FMTX__ "hhX"
+// AARCH64-NEXT: #define __UINT_FAST8_FMTo__ "hho"
+// AARCH64-NEXT: #define __UINT_FAST8_FMTu__ "hhu"
+// AARCH64-NEXT: #define __UINT_FAST8_FMTx__ "hhx"
+// AARCH64-NEXT: #define __UINT_FAST8_MAX__ 255
+// AARCH64-NEXT: #define __UINT_FAST8_TYPE__ unsigned char
+// AARCH64-NEXT: #define __UINT_LEAST16_FMTX__ "hX"
+// AARCH64-NEXT: #define __UINT_LEAST16_FMTo__ "ho"
+// AARCH64-NEXT: #define __UINT_LEAST16_FMTu__ "hu"
+// AARCH64-NEXT: #define __UINT_LEAST16_FMTx__ "hx"
+// AARCH64-NEXT: #define __UINT_LEAST16_MAX__ 65535
+// AARCH64-NEXT: #define __UINT_LEAST16_TYPE__ unsigned short
+// AARCH64-NEXT: #define __UINT_LEAST32_FMTX__ "X"
+// AARCH64-NEXT: #define __UINT_LEAST32_FMTo__ "o"
+// AARCH64-NEXT: #define __UINT_LEAST32_FMTu__ "u"
+// AARCH64-NEXT: #define __UINT_LEAST32_FMTx__ "x"
+// AARCH64-NEXT: #define __UINT_LEAST32_MAX__ 4294967295U
+// AARCH64-NEXT: #define __UINT_LEAST32_TYPE__ unsigned int
+// AARCH64-NEXT: #define __UINT_LEAST64_FMTX__ "lX"
+// AARCH64-NEXT: #define __UINT_LEAST64_FMTo__ "lo"
+// AARCH64-NEXT: #define __UINT_LEAST64_FMTu__ "lu"
+// AARCH64-NEXT: #define __UINT_LEAST64_FMTx__ "lx"
+// AARCH64-NEXT: #define __UINT_LEAST64_MAX__ 18446744073709551615UL
+// AARCH64-NEXT: #define __UINT_LEAST64_TYPE__ long unsigned int
+// AARCH64-NEXT: #define __UINT_LEAST8_FMTX__ "hhX"
+// AARCH64-NEXT: #define __UINT_LEAST8_FMTo__ "hho"
+// AARCH64-NEXT: #define __UINT_LEAST8_FMTu__ "hhu"
+// AARCH64-NEXT: #define __UINT_LEAST8_FMTx__ "hhx"
+// AARCH64-NEXT: #define __UINT_LEAST8_MAX__ 255
+// AARCH64-NEXT: #define __UINT_LEAST8_TYPE__ unsigned char
+// AARCH64-NEXT: #define __USER_LABEL_PREFIX__
+// AARCH64-NEXT: #define __VERSION__ "{{.*}}"
+// AARCH64-NEXT: #define __WCHAR_MAX__ 4294967295U
+// AARCH64-NEXT: #define __WCHAR_TYPE__ unsigned int
+// AARCH64-NEXT: #define __WCHAR_UNSIGNED__ 1
+// AARCH64-NEXT: #define __WCHAR_WIDTH__ 32
+// AARCH64-NEXT: #define __WINT_MAX__ 2147483647
+// AARCH64-NEXT: #define __WINT_TYPE__ int
+// AARCH64-NEXT: #define __WINT_WIDTH__ 32
+// AARCH64-NEXT: #define __aarch64__ 1
+
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-apple-ios7.0 < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-DARWIN %s
+
+// AARCH64-DARWIN: #define _LP64 1
+// AARCH64-DARWIN-NOT: #define __AARCH64EB__ 1
+// AARCH64-DARWIN: #define __AARCH64EL__ 1
+// AARCH64-DARWIN-NOT: #define __AARCH_BIG_ENDIAN 1
+// AARCH64-DARWIN: #define __ARM_64BIT_STATE 1
+// AARCH64-DARWIN: #define __ARM_ARCH 8
+// AARCH64-DARWIN: #define __ARM_ARCH_ISA_A64 1
+// AARCH64-DARWIN-NOT: #define __ARM_BIG_ENDIAN 1
+// AARCH64-DARWIN: #define __BIGGEST_ALIGNMENT__ 8
+// AARCH64-DARWIN: #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+// AARCH64-DARWIN: #define __CHAR16_TYPE__ unsigned short
+// AARCH64-DARWIN: #define __CHAR32_TYPE__ unsigned int
+// AARCH64-DARWIN: #define __CHAR_BIT__ 8
+// AARCH64-DARWIN: #define __DBL_DENORM_MIN__ 4.9406564584124654e-324
+// AARCH64-DARWIN: #define __DBL_DIG__ 15
+// AARCH64-DARWIN: #define __DBL_EPSILON__ 2.2204460492503131e-16
+// AARCH64-DARWIN: #define __DBL_HAS_DENORM__ 1
+// AARCH64-DARWIN: #define __DBL_HAS_INFINITY__ 1
+// AARCH64-DARWIN: #define __DBL_HAS_QUIET_NAN__ 1
+// AARCH64-DARWIN: #define __DBL_MANT_DIG__ 53
+// AARCH64-DARWIN: #define __DBL_MAX_10_EXP__ 308
+// AARCH64-DARWIN: #define __DBL_MAX_EXP__ 1024
+// AARCH64-DARWIN: #define __DBL_MAX__ 1.7976931348623157e+308
+// AARCH64-DARWIN: #define __DBL_MIN_10_EXP__ (-307)
+// AARCH64-DARWIN: #define __DBL_MIN_EXP__ (-1021)
+// AARCH64-DARWIN: #define __DBL_MIN__ 2.2250738585072014e-308
+// AARCH64-DARWIN: #define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
+// AARCH64-DARWIN: #define __FLT_DENORM_MIN__ 1.40129846e-45F
+// AARCH64-DARWIN: #define __FLT_DIG__ 6
+// AARCH64-DARWIN: #define __FLT_EPSILON__ 1.19209290e-7F
+// AARCH64-DARWIN: #define __FLT_EVAL_METHOD__ 0
+// AARCH64-DARWIN: #define __FLT_HAS_DENORM__ 1
+// AARCH64-DARWIN: #define __FLT_HAS_INFINITY__ 1
+// AARCH64-DARWIN: #define __FLT_HAS_QUIET_NAN__ 1
+// AARCH64-DARWIN: #define __FLT_MANT_DIG__ 24
+// AARCH64-DARWIN: #define __FLT_MAX_10_EXP__ 38
+// AARCH64-DARWIN: #define __FLT_MAX_EXP__ 128
+// AARCH64-DARWIN: #define __FLT_MAX__ 3.40282347e+38F
+// AARCH64-DARWIN: #define __FLT_MIN_10_EXP__ (-37)
+// AARCH64-DARWIN: #define __FLT_MIN_EXP__ (-125)
+// AARCH64-DARWIN: #define __FLT_MIN__ 1.17549435e-38F
+// AARCH64-DARWIN: #define __FLT_RADIX__ 2
+// AARCH64-DARWIN: #define __INT16_C_SUFFIX__
+// AARCH64-DARWIN: #define __INT16_FMTd__ "hd"
+// AARCH64-DARWIN: #define __INT16_FMTi__ "hi"
+// AARCH64-DARWIN: #define __INT16_MAX__ 32767
+// AARCH64-DARWIN: #define __INT16_TYPE__ short
+// AARCH64-DARWIN: #define __INT32_C_SUFFIX__
+// AARCH64-DARWIN: #define __INT32_FMTd__ "d"
+// AARCH64-DARWIN: #define __INT32_FMTi__ "i"
+// AARCH64-DARWIN: #define __INT32_MAX__ 2147483647
+// AARCH64-DARWIN: #define __INT32_TYPE__ int
+// AARCH64-DARWIN: #define __INT64_C_SUFFIX__ LL
+// AARCH64-DARWIN: #define __INT64_FMTd__ "lld"
+// AARCH64-DARWIN: #define __INT64_FMTi__ "lli"
+// AARCH64-DARWIN: #define __INT64_MAX__ 9223372036854775807LL
+// AARCH64-DARWIN: #define __INT64_TYPE__ long long int
+// AARCH64-DARWIN: #define __INT8_C_SUFFIX__
+// AARCH64-DARWIN: #define __INT8_FMTd__ "hhd"
+// AARCH64-DARWIN: #define __INT8_FMTi__ "hhi"
+// AARCH64-DARWIN: #define __INT8_MAX__ 127
+// AARCH64-DARWIN: #define __INT8_TYPE__ signed char
+// AARCH64-DARWIN: #define __INTMAX_C_SUFFIX__ L
+// AARCH64-DARWIN: #define __INTMAX_FMTd__ "ld"
+// AARCH64-DARWIN: #define __INTMAX_FMTi__ "li"
+// AARCH64-DARWIN: #define __INTMAX_MAX__ 9223372036854775807L
+// AARCH64-DARWIN: #define __INTMAX_TYPE__ long int
+// AARCH64-DARWIN: #define __INTMAX_WIDTH__ 64
+// AARCH64-DARWIN: #define __INTPTR_FMTd__ "ld"
+// AARCH64-DARWIN: #define __INTPTR_FMTi__ "li"
+// AARCH64-DARWIN: #define __INTPTR_MAX__ 9223372036854775807L
+// AARCH64-DARWIN: #define __INTPTR_TYPE__ long int
+// AARCH64-DARWIN: #define __INTPTR_WIDTH__ 64
+// AARCH64-DARWIN: #define __INT_FAST16_FMTd__ "hd"
+// AARCH64-DARWIN: #define __INT_FAST16_FMTi__ "hi"
+// AARCH64-DARWIN: #define __INT_FAST16_MAX__ 32767
+// AARCH64-DARWIN: #define __INT_FAST16_TYPE__ short
+// AARCH64-DARWIN: #define __INT_FAST32_FMTd__ "d"
+// AARCH64-DARWIN: #define __INT_FAST32_FMTi__ "i"
+// AARCH64-DARWIN: #define __INT_FAST32_MAX__ 2147483647
+// AARCH64-DARWIN: #define __INT_FAST32_TYPE__ int
+// AARCH64-DARWIN: #define __INT_FAST64_FMTd__ "lld"
+// AARCH64-DARWIN: #define __INT_FAST64_FMTi__ "lli"
+// AARCH64-DARWIN: #define __INT_FAST64_MAX__ 9223372036854775807LL
+// AARCH64-DARWIN: #define __INT_FAST64_TYPE__ long long int
+// AARCH64-DARWIN: #define __INT_FAST8_FMTd__ "hhd"
+// AARCH64-DARWIN: #define __INT_FAST8_FMTi__ "hhi"
+// AARCH64-DARWIN: #define __INT_FAST8_MAX__ 127
+// AARCH64-DARWIN: #define __INT_FAST8_TYPE__ signed char
+// AARCH64-DARWIN: #define __INT_LEAST16_FMTd__ "hd"
+// AARCH64-DARWIN: #define __INT_LEAST16_FMTi__ "hi"
+// AARCH64-DARWIN: #define __INT_LEAST16_MAX__ 32767
+// AARCH64-DARWIN: #define __INT_LEAST16_TYPE__ short
+// AARCH64-DARWIN: #define __INT_LEAST32_FMTd__ "d"
+// AARCH64-DARWIN: #define __INT_LEAST32_FMTi__ "i"
+// AARCH64-DARWIN: #define __INT_LEAST32_MAX__ 2147483647
+// AARCH64-DARWIN: #define __INT_LEAST32_TYPE__ int
+// AARCH64-DARWIN: #define __INT_LEAST64_FMTd__ "lld"
+// AARCH64-DARWIN: #define __INT_LEAST64_FMTi__ "lli"
+// AARCH64-DARWIN: #define __INT_LEAST64_MAX__ 9223372036854775807LL
+// AARCH64-DARWIN: #define __INT_LEAST64_TYPE__ long long int
+// AARCH64-DARWIN: #define __INT_LEAST8_FMTd__ "hhd"
+// AARCH64-DARWIN: #define __INT_LEAST8_FMTi__ "hhi"
+// AARCH64-DARWIN: #define __INT_LEAST8_MAX__ 127
+// AARCH64-DARWIN: #define __INT_LEAST8_TYPE__ signed char
+// AARCH64-DARWIN: #define __INT_MAX__ 2147483647
+// AARCH64-DARWIN: #define __LDBL_DENORM_MIN__ 4.9406564584124654e-324L
+// AARCH64-DARWIN: #define __LDBL_DIG__ 15
+// AARCH64-DARWIN: #define __LDBL_EPSILON__ 2.2204460492503131e-16L
+// AARCH64-DARWIN: #define __LDBL_HAS_DENORM__ 1
+// AARCH64-DARWIN: #define __LDBL_HAS_INFINITY__ 1
+// AARCH64-DARWIN: #define __LDBL_HAS_QUIET_NAN__ 1
+// AARCH64-DARWIN: #define __LDBL_MANT_DIG__ 53
+// AARCH64-DARWIN: #define __LDBL_MAX_10_EXP__ 308
+// AARCH64-DARWIN: #define __LDBL_MAX_EXP__ 1024
+// AARCH64-DARWIN: #define __LDBL_MAX__ 1.7976931348623157e+308L
+// AARCH64-DARWIN: #define __LDBL_MIN_10_EXP__ (-307)
+// AARCH64-DARWIN: #define __LDBL_MIN_EXP__ (-1021)
+// AARCH64-DARWIN: #define __LDBL_MIN__ 2.2250738585072014e-308L
+// AARCH64-DARWIN: #define __LONG_LONG_MAX__ 9223372036854775807LL
+// AARCH64-DARWIN: #define __LONG_MAX__ 9223372036854775807L
+// AARCH64-DARWIN: #define __LP64__ 1
+// AARCH64-DARWIN: #define __POINTER_WIDTH__ 64
+// AARCH64-DARWIN: #define __PTRDIFF_TYPE__ long int
+// AARCH64-DARWIN: #define __PTRDIFF_WIDTH__ 64
+// AARCH64-DARWIN: #define __SCHAR_MAX__ 127
+// AARCH64-DARWIN: #define __SHRT_MAX__ 32767
+// AARCH64-DARWIN: #define __SIG_ATOMIC_MAX__ 2147483647
+// AARCH64-DARWIN: #define __SIG_ATOMIC_WIDTH__ 32
+// AARCH64-DARWIN: #define __SIZEOF_DOUBLE__ 8
+// AARCH64-DARWIN: #define __SIZEOF_FLOAT__ 4
+// AARCH64-DARWIN: #define __SIZEOF_INT128__ 16
+// AARCH64-DARWIN: #define __SIZEOF_INT__ 4
+// AARCH64-DARWIN: #define __SIZEOF_LONG_DOUBLE__ 8
+// AARCH64-DARWIN: #define __SIZEOF_LONG_LONG__ 8
+// AARCH64-DARWIN: #define __SIZEOF_LONG__ 8
+// AARCH64-DARWIN: #define __SIZEOF_POINTER__ 8
+// AARCH64-DARWIN: #define __SIZEOF_PTRDIFF_T__ 8
+// AARCH64-DARWIN: #define __SIZEOF_SHORT__ 2
+// AARCH64-DARWIN: #define __SIZEOF_SIZE_T__ 8
+// AARCH64-DARWIN: #define __SIZEOF_WCHAR_T__ 4
+// AARCH64-DARWIN: #define __SIZEOF_WINT_T__ 4
+// AARCH64-DARWIN: #define __SIZE_MAX__ 18446744073709551615UL
+// AARCH64-DARWIN: #define __SIZE_TYPE__ long unsigned int
+// AARCH64-DARWIN: #define __SIZE_WIDTH__ 64
+// AARCH64-DARWIN: #define __UINT16_C_SUFFIX__
+// AARCH64-DARWIN: #define __UINT16_MAX__ 65535
+// AARCH64-DARWIN: #define __UINT16_TYPE__ unsigned short
+// AARCH64-DARWIN: #define __UINT32_C_SUFFIX__ U
+// AARCH64-DARWIN: #define __UINT32_MAX__ 4294967295U
+// AARCH64-DARWIN: #define __UINT32_TYPE__ unsigned int
+// AARCH64-DARWIN: #define __UINT64_C_SUFFIX__ ULL
+// AARCH64-DARWIN: #define __UINT64_MAX__ 18446744073709551615ULL
+// AARCH64-DARWIN: #define __UINT64_TYPE__ long long unsigned int
+// AARCH64-DARWIN: #define __UINT8_C_SUFFIX__
+// AARCH64-DARWIN: #define __UINT8_MAX__ 255
+// AARCH64-DARWIN: #define __UINT8_TYPE__ unsigned char
+// AARCH64-DARWIN: #define __UINTMAX_C_SUFFIX__ UL
+// AARCH64-DARWIN: #define __UINTMAX_MAX__ 18446744073709551615UL
+// AARCH64-DARWIN: #define __UINTMAX_TYPE__ long unsigned int
+// AARCH64-DARWIN: #define __UINTMAX_WIDTH__ 64
+// AARCH64-DARWIN: #define __UINTPTR_MAX__ 18446744073709551615UL
+// AARCH64-DARWIN: #define __UINTPTR_TYPE__ long unsigned int
+// AARCH64-DARWIN: #define __UINTPTR_WIDTH__ 64
+// AARCH64-DARWIN: #define __UINT_FAST16_MAX__ 65535
+// AARCH64-DARWIN: #define __UINT_FAST16_TYPE__ unsigned short
+// AARCH64-DARWIN: #define __UINT_FAST32_MAX__ 4294967295U
+// AARCH64-DARWIN: #define __UINT_FAST32_TYPE__ unsigned int
+// AARCH64-DARWIN: #define __UINT_FAST64_MAX__ 18446744073709551615ULL
+// AARCH64-DARWIN: #define __UINT_FAST64_TYPE__ long long unsigned int
+// AARCH64-DARWIN: #define __UINT_FAST8_MAX__ 255
+// AARCH64-DARWIN: #define __UINT_FAST8_TYPE__ unsigned char
+// AARCH64-DARWIN: #define __UINT_LEAST16_MAX__ 65535
+// AARCH64-DARWIN: #define __UINT_LEAST16_TYPE__ unsigned short
+// AARCH64-DARWIN: #define __UINT_LEAST32_MAX__ 4294967295U
+// AARCH64-DARWIN: #define __UINT_LEAST32_TYPE__ unsigned int
+// AARCH64-DARWIN: #define __UINT_LEAST64_MAX__ 18446744073709551615ULL
+// AARCH64-DARWIN: #define __UINT_LEAST64_TYPE__ long long unsigned int
+// AARCH64-DARWIN: #define __UINT_LEAST8_MAX__ 255
+// AARCH64-DARWIN: #define __UINT_LEAST8_TYPE__ unsigned char
+// AARCH64-DARWIN: #define __USER_LABEL_PREFIX__ _
+// AARCH64-DARWIN: #define __WCHAR_MAX__ 2147483647
+// AARCH64-DARWIN: #define __WCHAR_TYPE__ int
+// AARCH64-DARWIN-NOT: #define __WCHAR_UNSIGNED__
+// AARCH64-DARWIN: #define __WCHAR_WIDTH__ 32
+// AARCH64-DARWIN: #define __WINT_TYPE__ int
+// AARCH64-DARWIN: #define __WINT_WIDTH__ 32
+// AARCH64-DARWIN: #define __aarch64__ 1
+
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-windows-msvc < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-MSVC %s
+
+// AARCH64-MSVC: #define _INTEGRAL_MAX_BITS 64
+// AARCH64-MSVC-NOT: #define _LP64 1
+// AARCH64-MSVC: #define _M_ARM64 1
+// AARCH64-MSVC: #define _WIN32 1
+// AARCH64-MSVC: #define _WIN64 1
+// AARCH64-MSVC: #define __AARCH64EL__ 1
+// AARCH64-MSVC: #define __ARM_64BIT_STATE 1
+// AARCH64-MSVC: #define __ARM_ACLE 200
+// AARCH64-MSVC: #define __ARM_ALIGN_MAX_STACK_PWR 4
+// AARCH64-MSVC: #define __ARM_ARCH 8
+// AARCH64-MSVC: #define __ARM_ARCH_ISA_A64 1
+// AARCH64-MSVC: #define __ARM_ARCH_PROFILE 'A'
+// AARCH64-MSVC: #define __ARM_FEATURE_CLZ 1
+// AARCH64-MSVC: #define __ARM_FEATURE_DIRECTED_ROUNDING 1
+// AARCH64-MSVC: #define __ARM_FEATURE_DIV 1
+// AARCH64-MSVC: #define __ARM_FEATURE_FMA 1
+// AARCH64-MSVC: #define __ARM_FEATURE_IDIV 1
+// AARCH64-MSVC: #define __ARM_FEATURE_LDREX 0xF
+// AARCH64-MSVC: #define __ARM_FEATURE_NUMERIC_MAXMIN 1
+// AARCH64-MSVC: #define __ARM_FEATURE_UNALIGNED 1
+// AARCH64-MSVC: #define __ARM_FP 0xE
+// AARCH64-MSVC: #define __ARM_FP16_ARGS 1
+// AARCH64-MSVC: #define __ARM_FP16_FORMAT_IEEE 1
+// AARCH64-MSVC: #define __ARM_PCS_AAPCS64 1
+// AARCH64-MSVC: #define __ARM_SIZEOF_MINIMAL_ENUM 4
+// AARCH64-MSVC: #define __ARM_SIZEOF_WCHAR_T 4
+// AARCH64-MSVC: #define __BIGGEST_ALIGNMENT__ 16
+// AARCH64-MSVC: #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+// AARCH64-MSVC: #define __CHAR16_TYPE__ unsigned short
+// AARCH64-MSVC: #define __CHAR32_TYPE__ unsigned int
+// AARCH64-MSVC: #define __CHAR_BIT__ 8
+// AARCH64-MSVC: #define __CONSTANT_CFSTRINGS__ 1
+// AARCH64-MSVC: #define __DBL_DECIMAL_DIG__ 17
+// AARCH64-MSVC: #define __DBL_DENORM_MIN__ 4.9406564584124654e-324
+// AARCH64-MSVC: #define __DBL_DIG__ 15
+// AARCH64-MSVC: #define __DBL_EPSILON__ 2.2204460492503131e-16
+// AARCH64-MSVC: #define __DBL_HAS_DENORM__ 1
+// AARCH64-MSVC: #define __DBL_HAS_INFINITY__ 1
+// AARCH64-MSVC: #define __DBL_HAS_QUIET_NAN__ 1
+// AARCH64-MSVC: #define __DBL_MANT_DIG__ 53
+// AARCH64-MSVC: #define __DBL_MAX_10_EXP__ 308
+// AARCH64-MSVC: #define __DBL_MAX_EXP__ 1024
+// AARCH64-MSVC: #define __DBL_MAX__ 1.7976931348623157e+308
+// AARCH64-MSVC: #define __DBL_MIN_10_EXP__ (-307)
+// AARCH64-MSVC: #define __DBL_MIN_EXP__ (-1021)
+// AARCH64-MSVC: #define __DBL_MIN__ 2.2250738585072014e-308
+// AARCH64-MSVC: #define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
+// AARCH64-MSVC: #define __FINITE_MATH_ONLY__ 0
+// AARCH64-MSVC: #define __FLT_DECIMAL_DIG__ 9
+// AARCH64-MSVC: #define __FLT_DENORM_MIN__ 1.40129846e-45F
+// AARCH64-MSVC: #define __FLT_DIG__ 6
+// AARCH64-MSVC: #define __FLT_EPSILON__ 1.19209290e-7F
+// AARCH64-MSVC: #define __FLT_EVAL_METHOD__ 0
+// AARCH64-MSVC: #define __FLT_HAS_DENORM__ 1
+// AARCH64-MSVC: #define __FLT_HAS_INFINITY__ 1
+// AARCH64-MSVC: #define __FLT_HAS_QUIET_NAN__ 1
+// AARCH64-MSVC: #define __FLT_MANT_DIG__ 24
+// AARCH64-MSVC: #define __FLT_MAX_10_EXP__ 38
+// AARCH64-MSVC: #define __FLT_MAX_EXP__ 128
+// AARCH64-MSVC: #define __FLT_MAX__ 3.40282347e+38F
+// AARCH64-MSVC: #define __FLT_MIN_10_EXP__ (-37)
+// AARCH64-MSVC: #define __FLT_MIN_EXP__ (-125)
+// AARCH64-MSVC: #define __FLT_MIN__ 1.17549435e-38F
+// AARCH64-MSVC: #define __FLT_RADIX__ 2
+// AARCH64-MSVC: #define __INT_MAX__ 2147483647
+// AARCH64-MSVC: #define __LDBL_DECIMAL_DIG__ 17
+// AARCH64-MSVC: #define __LDBL_DENORM_MIN__ 4.9406564584124654e-324L
+// AARCH64-MSVC: #define __LDBL_DIG__ 15
+// AARCH64-MSVC: #define __LDBL_EPSILON__ 2.2204460492503131e-16L
+// AARCH64-MSVC: #define __LDBL_HAS_DENORM__ 1
+// AARCH64-MSVC: #define __LDBL_HAS_INFINITY__ 1
+// AARCH64-MSVC: #define __LDBL_HAS_QUIET_NAN__ 1
+// AARCH64-MSVC: #define __LDBL_MANT_DIG__ 53
+// AARCH64-MSVC: #define __LDBL_MAX_10_EXP__ 308
+// AARCH64-MSVC: #define __LDBL_MAX_EXP__ 1024
+// AARCH64-MSVC: #define __LDBL_MAX__ 1.7976931348623157e+308L
+// AARCH64-MSVC: #define __LDBL_MIN_10_EXP__ (-307)
+// AARCH64-MSVC: #define __LDBL_MIN_EXP__ (-1021)
+// AARCH64-MSVC: #define __LDBL_MIN__ 2.2250738585072014e-308L
+// AARCH64-MSVC: #define __LITTLE_ENDIAN__ 1
+// AARCH64-MSVC: #define __LONG_LONG_MAX__ 9223372036854775807LL
+// AARCH64-MSVC: #define __LONG_MAX__ 2147483647L
+// AARCH64-MSVC-NOT: #define __LP64__ 1
+// AARCH64-MSVC: #define __NO_INLINE__ 1
+// AARCH64-MSVC: #define __OBJC_BOOL_IS_BOOL 0
+// AARCH64-MSVC: #define __ORDER_BIG_ENDIAN__ 4321
+// AARCH64-MSVC: #define __ORDER_LITTLE_ENDIAN__ 1234
+// AARCH64-MSVC: #define __ORDER_PDP_ENDIAN__ 3412
+// AARCH64-MSVC: #define __POINTER_WIDTH__ 64
+// AARCH64-MSVC: #define __PRAGMA_REDEFINE_EXTNAME 1
+// AARCH64-MSVC: #define __SCHAR_MAX__ 127
+// AARCH64-MSVC: #define __SHRT_MAX__ 32767
+// AARCH64-MSVC: #define __SIG_ATOMIC_MAX__ 2147483647
+// AARCH64-MSVC: #define __SIG_ATOMIC_WIDTH__ 32
+// AARCH64-MSVC: #define __SIZEOF_DOUBLE__ 8
+// AARCH64-MSVC: #define __SIZEOF_FLOAT__ 4
+// AARCH64-MSVC: #define __SIZEOF_INT128__ 16
+// AARCH64-MSVC: #define __SIZEOF_INT__ 4
+// AARCH64-MSVC: #define __SIZEOF_LONG_DOUBLE__ 8
+// AARCH64-MSVC: #define __SIZEOF_LONG_LONG__ 8
+// AARCH64-MSVC: #define __SIZEOF_LONG__ 4
+// AARCH64-MSVC: #define __SIZEOF_POINTER__ 8
+// AARCH64-MSVC: #define __SIZEOF_PTRDIFF_T__ 8
+// AARCH64-MSVC: #define __SIZEOF_SHORT__ 2
+// AARCH64-MSVC: #define __SIZEOF_SIZE_T__ 8
+// AARCH64-MSVC: #define __SIZEOF_WCHAR_T__ 2
+// AARCH64-MSVC: #define __SIZEOF_WINT_T__ 2
+// AARCH64-MSVC: #define __SIZE_MAX__ 18446744073709551615ULL
+// AARCH64-MSVC: #define __SIZE_TYPE__ long long unsigned int
+// AARCH64-MSVC: #define __SIZE_WIDTH__ 64
+// AARCH64-MSVC: #define __STDC_HOSTED__ 0
+// AARCH64-MSVC: #define __STDC_UTF_16__ 1
+// AARCH64-MSVC: #define __STDC_UTF_32__ 1
+// AARCH64-MSVC: #define __STDC_VERSION__ 201112L
+// AARCH64-MSVC: #define __STDC__ 1
+// AARCH64-MSVC: #define __UINT16_C_SUFFIX__
+// AARCH64-MSVC: #define __UINT16_MAX__ 65535
+// AARCH64-MSVC: #define __UINT16_TYPE__ unsigned short
+// AARCH64-MSVC: #define __UINT32_C_SUFFIX__ U
+// AARCH64-MSVC: #define __UINT32_MAX__ 4294967295U
+// AARCH64-MSVC: #define __UINT32_TYPE__ unsigned int
+// AARCH64-MSVC: #define __UINT64_C_SUFFIX__ ULL
+// AARCH64-MSVC: #define __UINT64_MAX__ 18446744073709551615ULL
+// AARCH64-MSVC: #define __UINT64_TYPE__ long long unsigned int
+// AARCH64-MSVC: #define __UINT8_C_SUFFIX__
+// AARCH64-MSVC: #define __UINT8_MAX__ 255
+// AARCH64-MSVC: #define __UINT8_TYPE__ unsigned char
+// AARCH64-MSVC: #define __UINTMAX_C_SUFFIX__ ULL
+// AARCH64-MSVC: #define __UINTMAX_MAX__ 18446744073709551615ULL
+// AARCH64-MSVC: #define __UINTMAX_TYPE__ long long unsigned int
+// AARCH64-MSVC: #define __UINTMAX_WIDTH__ 64
+// AARCH64-MSVC: #define __UINTPTR_MAX__ 18446744073709551615ULL
+// AARCH64-MSVC: #define __UINTPTR_TYPE__ long long unsigned int
+// AARCH64-MSVC: #define __UINTPTR_WIDTH__ 64
+// AARCH64-MSVC: #define __UINT_FAST16_MAX__ 65535
+// AARCH64-MSVC: #define __UINT_FAST16_TYPE__ unsigned short
+// AARCH64-MSVC: #define __UINT_FAST32_MAX__ 4294967295U
+// AARCH64-MSVC: #define __UINT_FAST32_TYPE__ unsigned int
+// AARCH64-MSVC: #define __UINT_FAST64_MAX__ 18446744073709551615ULL
+// AARCH64-MSVC: #define __UINT_FAST64_TYPE__ long long unsigned int
+// AARCH64-MSVC: #define __UINT_FAST8_MAX__ 255
+// AARCH64-MSVC: #define __UINT_FAST8_TYPE__ unsigned char
+// AARCH64-MSVC: #define __UINT_LEAST16_MAX__ 65535
+// AARCH64-MSVC: #define __UINT_LEAST16_TYPE__ unsigned short
+// AARCH64-MSVC: #define __UINT_LEAST32_MAX__ 4294967295U
+// AARCH64-MSVC: #define __UINT_LEAST32_TYPE__ unsigned int
+// AARCH64-MSVC: #define __UINT_LEAST64_MAX__ 18446744073709551615ULL
+// AARCH64-MSVC: #define __UINT_LEAST64_TYPE__ long long unsigned int
+// AARCH64-MSVC: #define __UINT_LEAST8_MAX__ 255
+// AARCH64-MSVC: #define __UINT_LEAST8_TYPE__ unsigned char
+// AARCH64-MSVC: #define __USER_LABEL_PREFIX__
+// AARCH64-MSVC: #define __WCHAR_MAX__ 65535
+// AARCH64-MSVC: #define __WCHAR_TYPE__ unsigned short
+// AARCH64-MSVC: #define __WCHAR_UNSIGNED__ 1
+// AARCH64-MSVC: #define __WCHAR_WIDTH__ 16
+// AARCH64-MSVC: #define __WINT_TYPE__ unsigned short
+// AARCH64-MSVC: #define __WINT_WIDTH__ 16
+// AARCH64-MSVC: #define __aarch64__ 1
+
+// RUN: %clang_cc1 -triple=aarch64 -E -dM -mcmodel=small -xc /dev/null | FileCheck --check-prefix=CMODEL_SMALL %s
+// RUN: %clang_cc1 -triple=aarch64 -E -dM -mcmodel=tiny -xc /dev/null | FileCheck --check-prefix=CMODEL_TINY %s
+// RUN: %clang_cc1 -triple=aarch64 -E -dM -mcmodel=large -xc /dev/null | FileCheck --check-prefix=CMODEL_LARGE %s
+
+// CMODEL_TINY: #define __AARCH64_CMODEL_TINY__ 1
+// CMODEL_SMALL: #define __AARCH64_CMODEL_SMALL__ 1
+// CMODEL_LARGE: #define __AARCH64_CMODEL_LARGE__ 1
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -277,1342 +277,12 @@
 // SHORTWCHAR2: #define __WCHAR_WIDTH__ 32
 // Other definitions vary from platform to platform
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64 %s
-// RUN: %clang_cc1 -x c++ -E -dM -ffreestanding -triple=arm64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64 -check-prefix AARCH64-CXX %s
-//
-// AARCH64:#define _LP64 1
-// AARCH64-NOT:#define __AARCH64EB__ 1
-// AARCH64:#define __AARCH64EL__ 1
-// AARCH64-NOT:#define __AARCH_BIG_ENDIAN 1
-// AARCH64:#define __ARM_64BIT_STATE 1
-// AARCH64:#define __ARM_ARCH 8
-// AARCH64:#define __ARM_ARCH_ISA_A64 1
-// AARCH64-NOT:#define __ARM_BIG_ENDIAN 1
-// AARCH64:#define __BIGGEST_ALIGNMENT__ 16
-// AARCH64:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
-// AARCH64:#define __CHAR16_TYPE__ unsigned short
-// AARCH64:#define __CHAR32_TYPE__ unsigned int
-// AARCH64:#define __CHAR_BIT__ 8
-// AARCH64:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324
-// AARCH64:#define __DBL_DIG__ 15
-// AARCH64:#define __DBL_EPSILON__ 2.2204460492503131e-16
-// AARCH64:#define __DBL_HAS_DENORM__ 1
-// AARCH64:#define __DBL_HAS_INFINITY__ 1
-// AARCH64:#define __DBL_HAS_QUIET_NAN__ 1
-// AARCH64:#define __DBL_MANT_DIG__ 53
-// AARCH64:#define __DBL_MAX_10_EXP__ 308
-// AARCH64:#define __DBL_MAX_EXP__ 1024
-// AARCH64:#define __DBL_MAX__ 1.7976931348623157e+308
-// AARCH64:#define __DBL_MIN_10_EXP__ (-307)
-// AARCH64:#define __DBL_MIN_EXP__ (-1021)
-// AARCH64:#define __DBL_MIN__ 2.2250738585072014e-308
-// AARCH64:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
-// AARCH64:#define __FLT16_DECIMAL_DIG__ 5
-// AARCH64:#define __FLT16_DENORM_MIN__ 5.9604644775390625e-8F16
-// AARCH64:#define __FLT16_DIG__ 3
-// AARCH64:#define __FLT16_EPSILON__ 9.765625e-4F16
-// AARCH64:#define __FLT16_HAS_DENORM__ 1
-// AARCH64:#define __FLT16_HAS_INFINITY__ 1
-// AARCH64:#define __FLT16_HAS_QUIET_NAN__ 1
-// AARCH64:#define __FLT16_MANT_DIG__ 11
-// AARCH64:#define __FLT16_MAX_10_EXP__ 4
-// AARCH64:#define __FLT16_MAX_EXP__ 16
-// AARCH64:#define __FLT16_MAX__ 6.5504e+4F16
-// AARCH64:#define __FLT16_MIN_10_EXP__ (-4)
-// AARCH64:#define __FLT16_MIN_EXP__ (-13)
-// AARCH64:#define __FLT16_MIN__ 6.103515625e-5F16
-// AARCH64:#define __FLT_DENORM_MIN__ 1.40129846e-45F
-// AARCH64:#define __FLT_DIG__ 6
-// AARCH64:#define __FLT_EPSILON__ 1.19209290e-7F
-// AARCH64:#define __FLT_EVAL_METHOD__ 0
-// AARCH64:#define __FLT_HAS_DENORM__ 1
-// AARCH64:#define __FLT_HAS_INFINITY__ 1
-// AARCH64:#define __FLT_HAS_QUIET_NAN__ 1
-// AARCH64:#define __FLT_MANT_DIG__ 24
-// AARCH64:#define __FLT_MAX_10_EXP__ 38
-// AARCH64:#define __FLT_MAX_EXP__ 128
-// AARCH64:#define __FLT_MAX__ 3.40282347e+38F
-// AARCH64:#define __FLT_MIN_10_EXP__ (-37)
-// AARCH64:#define __FLT_MIN_EXP__ (-125)
-// AARCH64:#define __FLT_MIN__ 1.17549435e-38F
-// AARCH64:#define __FLT_RADIX__ 2
-// AARCH64:#define __INT16_C_SUFFIX__
-// AARCH64:#define __INT16_FMTd__ "hd"
-// AARCH64:#define __INT16_FMTi__ "hi"
-// AARCH64:#define __INT16_MAX__ 32767
-// AARCH64:#define __INT16_TYPE__ short
-// AARCH64:#define __INT32_C_SUFFIX__
-// AARCH64:#define __INT32_FMTd__ "d"
-// AARCH64:#define __INT32_FMTi__ "i"
-// AARCH64:#define __INT32_MAX__ 2147483647
-// AARCH64:#define __INT32_TYPE__ int
-// AARCH64:#define __INT64_C_SUFFIX__ L
-// AARCH64:#define __INT64_FMTd__ "ld"
-// AARCH64:#define __INT64_FMTi__ "li"
-// AARCH64:#define __INT64_MAX__ 9223372036854775807L
-// AARCH64:#define __INT64_TYPE__ long int
-// AARCH64:#define __INT8_C_SUFFIX__
-// AARCH64:#define __INT8_FMTd__ "hhd"
-// AARCH64:#define __INT8_FMTi__ "hhi"
-// AARCH64:#define __INT8_MAX__ 127
-// AARCH64:#define __INT8_TYPE__ signed char
-// AARCH64:#define __INTMAX_C_SUFFIX__ L
-// AARCH64:#define __INTMAX_FMTd__ "ld"
-// AARCH64:#define __INTMAX_FMTi__ "li"
-// AARCH64:#define __INTMAX_MAX__ 9223372036854775807L
-// AARCH64:#define __INTMAX_TYPE__ long int
-// AARCH64:#define __INTMAX_WIDTH__ 64
-// AARCH64:#define __INTPTR_FMTd__ "ld"
-// AARCH64:#define __INTPTR_FMTi__ "li"
-// AARCH64:#define __INTPTR_MAX__ 9223372036854775807L
-// AARCH64:#define __INTPTR_TYPE__ long int
-// AARCH64:#define __INTPTR_WIDTH__ 64
-// AARCH64:#define __INT_FAST16_FMTd__ "hd"
-// AARCH64:#define __INT_FAST16_FMTi__ "hi"
-// AARCH64:#define __INT_FAST16_MAX__ 32767
-// AARCH64:#define __INT_FAST16_TYPE__ short
-// AARCH64:#define __INT_FAST32_FMTd__ "d"
-// AARCH64:#define __INT_FAST32_FMTi__ "i"
-// AARCH64:#define __INT_FAST32_MAX__ 2147483647
-// AARCH64:#define __INT_FAST32_TYPE__ int
-// AARCH64:#define __INT_FAST64_FMTd__ "ld"
-// AARCH64:#define __INT_FAST64_FMTi__ "li"
-// AARCH64:#define __INT_FAST64_MAX__ 9223372036854775807L
-// AARCH64:#define __INT_FAST64_TYPE__ long int
-// AARCH64:#define __INT_FAST8_FMTd__ "hhd"
-// AARCH64:#define __INT_FAST8_FMTi__ "hhi"
-// AARCH64:#define __INT_FAST8_MAX__ 127
-// AARCH64:#define __INT_FAST8_TYPE__ signed char
-// AARCH64:#define __INT_LEAST16_FMTd__ "hd"
-// AARCH64:#define __INT_LEAST16_FMTi__ "hi"
-// AARCH64:#define __INT_LEAST16_MAX__ 32767
-// AARCH64:#define __INT_LEAST16_TYPE__ short
-// AARCH64:#define __INT_LEAST32_FMTd__ "d"
-// AARCH64:#define __INT_LEAST32_FMTi__ "i"
-// AARCH64:#define __INT_LEAST32_MAX__ 2147483647
-// AARCH64:#define __INT_LEAST32_TYPE__ int
-// AARCH64:#define __INT_LEAST64_FMTd__ "ld"
-// AARCH64:#define __INT_LEAST64_FMTi__ "li"
-// AARCH64:#define __INT_LEAST64_MAX__ 9223372036854775807L
-// AARCH64:#define __INT_LEAST64_TYPE__ long int
-// AARCH64:#define __INT_LEAST8_FMTd__ "hhd"
-// AARCH64:#define __INT_LEAST8_FMTi__ "hhi"
-// AARCH64:#define __INT_LEAST8_MAX__ 127
-// AARCH64:#define __INT_LEAST8_TYPE__ signed char
-// AARCH64:#define __INT_MAX__ 2147483647
-// AARCH64:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L
-// AARCH64:#define __LDBL_DIG__ 33
-// AARCH64:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L
-// AARCH64:#define __LDBL_HAS_DENORM__ 1
-// AARCH64:#define __LDBL_HAS_INFINITY__ 1
-// AARCH64:#define __LDBL_HAS_QUIET_NAN__ 1
-// AARCH64:#define __LDBL_MANT_DIG__ 113
-// AARCH64:#define __LDBL_MAX_10_EXP__ 4932
-// AARCH64:#define __LDBL_MAX_EXP__ 16384
-// AARCH64:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L
-// AARCH64:#define __LDBL_MIN_10_EXP__ (-4931)
-// AARCH64:#define __LDBL_MIN_EXP__ (-16381)
-// AARCH64:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L
-// AARCH64:#define __LONG_LONG_MAX__ 9223372036854775807LL
-// AARCH64:#define __LONG_MAX__ 9223372036854775807L
-// AARCH64:#define __LP64__ 1
-// AARCH64:#define __POINTER_WIDTH__ 64
-// AARCH64:#define __PTRDIFF_TYPE__ long int
-// AARCH64:#define __PTRDIFF_WIDTH__ 64
-// AARCH64:#define __SCHAR_MAX__ 127
-// AARCH64:#define __SHRT_MAX__ 32767
-// AARCH64:#define __SIG_ATOMIC_MAX__ 2147483647
-// AARCH64:#define __SIG_ATOMIC_WIDTH__ 32
-// AARCH64:#define __SIZEOF_DOUBLE__ 8
-// AARCH64:#define __SIZEOF_FLOAT__ 4
-// AARCH64:#define __SIZEOF_INT128__ 16
-// AARCH64:#define __SIZEOF_INT__ 4
-// AARCH64:#define __SIZEOF_LONG_DOUBLE__ 16
-// AARCH64:#define __SIZEOF_LONG_LONG__ 8
-// AARCH64:#define __SIZEOF_LONG__ 8
-// AARCH64:#define __SIZEOF_POINTER__ 8
-// AARCH64:#define __SIZEOF_PTRDIFF_T__ 8
-// AARCH64:#define __SIZEOF_SHORT__ 2
-// AARCH64:#define __SIZEOF_SIZE_T__ 8
-// AARCH64:#define __SIZEOF_WCHAR_T__ 4
-// AARCH64:#define __SIZEOF_WINT_T__ 4
-// AARCH64:#define __SIZE_MAX__ 18446744073709551615UL
-// AARCH64:#define __SIZE_TYPE__ long unsigned int
-// AARCH64:#define __SIZE_WIDTH__ 64
-// AARCH64-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
-// AARCH64:#define __UINT16_C_SUFFIX__
-// AARCH64:#define __UINT16_MAX__ 65535
-// AARCH64:#define __UINT16_TYPE__ unsigned short
-// AARCH64:#define __UINT32_C_SUFFIX__ U
-// AARCH64:#define __UINT32_MAX__ 4294967295U
-// AARCH64:#define __UINT32_TYPE__ unsigned int
-// AARCH64:#define __UINT64_C_SUFFIX__ UL
-// AARCH64:#define __UINT64_MAX__ 18446744073709551615UL
-// AARCH64:#define __UINT64_TYPE__ long unsigned int
-// AARCH64:#define __UINT8_C_SUFFIX__
-// AARCH64:#define __UINT8_MAX__ 255
-// AARCH64:#define __UINT8_TYPE__ unsigned char
-// AARCH64:#define __UINTMAX_C_SUFFIX__ UL
-// AARCH64:#define __UINTMAX_MAX__ 18446744073709551615UL
-// AARCH64:#define __UINTMAX_TYPE__ long unsigned int
-// AARCH64:#define __UINTMAX_WIDTH__ 64
-// AARCH64:#define __UINTPTR_MAX__ 18446744073709551615UL
-// AARCH64:#define __UINTPTR_TYPE__ long unsigned int
-// AARCH64:#define __UINTPTR_WIDTH__ 64
-// AARCH64:#define __UINT_FAST16_MAX__ 65535
-// AARCH64:#define __UINT_FAST16_TYPE__ unsigned short
-// AARCH64:#define __UINT_FAST32_MAX__ 4294967295U
-// AARCH64:#define __UINT_FAST32_TYPE__ unsigned int
-// AARCH64:#define __UINT_FAST64_MAX__ 18446744073709551615UL
-// AARCH64:#define __UINT_FAST64_TYPE__ long unsigned int
-// AARCH64:#define __UINT_FAST8_MAX__ 255
-// AARCH64:#define __UINT_FAST8_TYPE__ unsigned char
-// AARCH64:#define __UINT_LEAST16_MAX__ 65535
-// AARCH64:#define __UINT_LEAST16_TYPE__ unsigned short
-// AARCH64:#define __UINT_LEAST32_MAX__ 4294967295U
-// AARCH64:#define __UINT_LEAST32_TYPE__ unsigned int
-// AARCH64:#define __UINT_LEAST64_MAX__ 18446744073709551615UL
-// AARCH64:#define __UINT_LEAST64_TYPE__ long unsigned int
-// AARCH64:#define __UINT_LEAST8_MAX__ 255
-// AARCH64:#define __UINT_LEAST8_TYPE__ unsigned char
-// AARCH64:#define __USER_LABEL_PREFIX__
-// AARCH64:#define __WCHAR_MAX__ 4294967295U
-// AARCH64:#define __WCHAR_TYPE__ unsigned int
-// AARCH64:#define __WCHAR_UNSIGNED__ 1
-// AARCH64:#define __WCHAR_WIDTH__ 32
-// AARCH64:#define __WINT_TYPE__ int
-// AARCH64:#define __WINT_WIDTH__ 32
-// AARCH64:#define __aarch64__ 1
-//
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64_be-none-none < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-BE %s
-//
-// AARCH64-BE:#define _LP64 1
-// AARCH64-BE:#define __AARCH64EB__ 1
-// AARCH64-BE-NOT:#define __AARCH64EL__ 1
-// AARCH64-BE:#define __AARCH_BIG_ENDIAN 1
-// AARCH64-BE:#define __ARM_64BIT_STATE 1
-// AARCH64-BE:#define __ARM_ARCH 8
-// AARCH64-BE:#define __ARM_ARCH_ISA_A64 1
-// AARCH64-BE:#define __ARM_BIG_ENDIAN 1
-// AARCH64-BE:#define __BIGGEST_ALIGNMENT__ 16
-// AARCH64-BE:#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__
-// AARCH64-BE:#define __CHAR16_TYPE__ unsigned short
-// AARCH64-BE:#define __CHAR32_TYPE__ unsigned int
-// AARCH64-BE:#define __CHAR_BIT__ 8
-// AARCH64-BE:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324
-// AARCH64-BE:#define __DBL_DIG__ 15
-// AARCH64-BE:#define __DBL_EPSILON__ 2.2204460492503131e-16
-// AARCH64-BE:#define __DBL_HAS_DENORM__ 1
-// AARCH64-BE:#define __DBL_HAS_INFINITY__ 1
-// AARCH64-BE:#define __DBL_HAS_QUIET_NAN__ 1
-// AARCH64-BE:#define __DBL_MANT_DIG__ 53
-// AARCH64-BE:#define __DBL_MAX_10_EXP__ 308
-// AARCH64-BE:#define __DBL_MAX_EXP__ 1024
-// AARCH64-BE:#define __DBL_MAX__ 1.7976931348623157e+308
-// AARCH64-BE:#define __DBL_MIN_10_EXP__ (-307)
-// AARCH64-BE:#define __DBL_MIN_EXP__ (-1021)
-// AARCH64-BE:#define __DBL_MIN__ 2.2250738585072014e-308
-// AARCH64-BE:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
-// AARCH64-BE:#define __FLT_DENORM_MIN__ 1.40129846e-45F
-// AARCH64-BE:#define __FLT_DIG__ 6
-// AARCH64-BE:#define __FLT_EPSILON__ 1.19209290e-7F
-// AARCH64-BE:#define __FLT_EVAL_METHOD__ 0
-// AARCH64-BE:#define __FLT_HAS_DENORM__ 1
-// AARCH64-BE:#define __FLT_HAS_INFINITY__ 1
-// AARCH64-BE:#define __FLT_HAS_QUIET_NAN__ 1
-// AARCH64-BE:#define __FLT_MANT_DIG__ 24
-// AARCH64-BE:#define __FLT_MAX_10_EXP__ 38
-// AARCH64-BE:#define __FLT_MAX_EXP__ 128
-// AARCH64-BE:#define __FLT_MAX__ 3.40282347e+38F
-// AARCH64-BE:#define __FLT_MIN_10_EXP__ (-37)
-// AARCH64-BE:#define __FLT_MIN_EXP__ (-125)
-// AARCH64-BE:#define __FLT_MIN__ 1.17549435e-38F
-// AARCH64-BE:#define __FLT_RADIX__ 2
-// AARCH64-BE:#define __INT16_C_SUFFIX__
-// AARCH64-BE:#define __INT16_FMTd__ "hd"
-// AARCH64-BE:#define __INT16_FMTi__ "hi"
-// AARCH64-BE:#define __INT16_MAX__ 32767
-// AARCH64-BE:#define __INT16_TYPE__ short
-// AARCH64-BE:#define __INT32_C_SUFFIX__
-// AARCH64-BE:#define __INT32_FMTd__ "d"
-// AARCH64-BE:#define __INT32_FMTi__ "i"
-// AARCH64-BE:#define __INT32_MAX__ 2147483647
-// AARCH64-BE:#define __INT32_TYPE__ int
-// AARCH64-BE:#define __INT64_C_SUFFIX__ L
-// AARCH64-BE:#define __INT64_FMTd__ "ld"
-// AARCH64-BE:#define __INT64_FMTi__ "li"
-// AARCH64-BE:#define __INT64_MAX__ 9223372036854775807L
-// AARCH64-BE:#define __INT64_TYPE__ long int
-// AARCH64-BE:#define __INT8_C_SUFFIX__
-// AARCH64-BE:#define __INT8_FMTd__ "hhd"
-// AARCH64-BE:#define __INT8_FMTi__ "hhi"
-// AARCH64-BE:#define __INT8_MAX__ 127
-// AARCH64-BE:#define __INT8_TYPE__ signed char
-// AARCH64-BE:#define __INTMAX_C_SUFFIX__ L
-// AARCH64-BE:#define __INTMAX_FMTd__ "ld"
-// AARCH64-BE:#define __INTMAX_FMTi__ "li"
-// AARCH64-BE:#define __INTMAX_MAX__ 9223372036854775807L
-// AARCH64-BE:#define __INTMAX_TYPE__ long int
-// AARCH64-BE:#define __INTMAX_WIDTH__ 64
-// AARCH64-BE:#define __INTPTR_FMTd__ "ld"
-// AARCH64-BE:#define __INTPTR_FMTi__ "li"
-// AARCH64-BE:#define __INTPTR_MAX__ 9223372036854775807L
-// AARCH64-BE:#define __INTPTR_TYPE__ long int
-// AARCH64-BE:#define __INTPTR_WIDTH__ 64
-// AARCH64-BE:#define __INT_FAST16_FMTd__ "hd"
-// AARCH64-BE:#define __INT_FAST16_FMTi__ "hi"
-// AARCH64-BE:#define __INT_FAST16_MAX__ 32767
-// AARCH64-BE:#define __INT_FAST16_TYPE__ short
-// AARCH64-BE:#define __INT_FAST32_FMTd__ "d"
-// AARCH64-BE:#define __INT_FAST32_FMTi__ "i"
-// AARCH64-BE:#define __INT_FAST32_MAX__ 2147483647
-// AARCH64-BE:#define __INT_FAST32_TYPE__ int
-// AARCH64-BE:#define __INT_FAST64_FMTd__ "ld"
-// AARCH64-BE:#define __INT_FAST64_FMTi__ "li"
-// AARCH64-BE:#define __INT_FAST64_MAX__ 9223372036854775807L
-// AARCH64-BE:#define __INT_FAST64_TYPE__ long int
-// AARCH64-BE:#define __INT_FAST8_FMTd__ "hhd"
-// AARCH64-BE:#define __INT_FAST8_FMTi__ "hhi"
-// AARCH64-BE:#define __INT_FAST8_MAX__ 127
-// AARCH64-BE:#define __INT_FAST8_TYPE__ signed char
-// AARCH64-BE:#define __INT_LEAST16_FMTd__ "hd"
-// AARCH64-BE:#define __INT_LEAST16_FMTi__ "hi"
-// AARCH64-BE:#define __INT_LEAST16_MAX__ 32767
-// AARCH64-BE:#define __INT_LEAST16_TYPE__ short
-// AARCH64-BE:#define __INT_LEAST32_FMTd__ "d"
-// AARCH64-BE:#define __INT_LEAST32_FMTi__ "i"
-// AARCH64-BE:#define __INT_LEAST32_MAX__ 2147483647
-// AARCH64-BE:#define __INT_LEAST32_TYPE__ int
-// AARCH64-BE:#define __INT_LEAST64_FMTd__ "ld"
-// AARCH64-BE:#define __INT_LEAST64_FMTi__ "li"
-// AARCH64-BE:#define __INT_LEAST64_MAX__ 9223372036854775807L
-// AARCH64-BE:#define __INT_LEAST64_TYPE__ long int
-// AARCH64-BE:#define __INT_LEAST8_FMTd__ "hhd"
-// AARCH64-BE:#define __INT_LEAST8_FMTi__ "hhi"
-// AARCH64-BE:#define __INT_LEAST8_MAX__ 127
-// AARCH64-BE:#define __INT_LEAST8_TYPE__ signed char
-// AARCH64-BE:#define __INT_MAX__ 2147483647
-// AARCH64-BE:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L
-// AARCH64-BE:#define __LDBL_DIG__ 33
-// AARCH64-BE:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L
-// AARCH64-BE:#define __LDBL_HAS_DENORM__ 1
-// AARCH64-BE:#define __LDBL_HAS_INFINITY__ 1
-// AARCH64-BE:#define __LDBL_HAS_QUIET_NAN__ 1
-// AARCH64-BE:#define __LDBL_MANT_DIG__ 113
-// AARCH64-BE:#define __LDBL_MAX_10_EXP__ 4932
-// AARCH64-BE:#define __LDBL_MAX_EXP__ 16384
-// AARCH64-BE:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L
-// AARCH64-BE:#define __LDBL_MIN_10_EXP__ (-4931)
-// AARCH64-BE:#define __LDBL_MIN_EXP__ (-16381)
-// AARCH64-BE:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L
-// AARCH64-BE:#define __LONG_LONG_MAX__ 9223372036854775807LL
-// AARCH64-BE:#define __LONG_MAX__ 9223372036854775807L
-// AARCH64-BE:#define __LP64__ 1
-// AARCH64-BE:#define __POINTER_WIDTH__ 64
-// AARCH64-BE:#define __PTRDIFF_TYPE__ long int
-// AARCH64-BE:#define __PTRDIFF_WIDTH__ 64
-// AARCH64-BE:#define __SCHAR_MAX__ 127
-// AARCH64-BE:#define __SHRT_MAX__ 32767
-// AARCH64-BE:#define __SIG_ATOMIC_MAX__ 2147483647
-// AARCH64-BE:#define __SIG_ATOMIC_WIDTH__ 32
-// AARCH64-BE:#define __SIZEOF_DOUBLE__ 8
-// AARCH64-BE:#define __SIZEOF_FLOAT__ 4
-// AARCH64-BE:#define __SIZEOF_INT128__ 16
-// AARCH64-BE:#define __SIZEOF_INT__ 4
-// AARCH64-BE:#define __SIZEOF_LONG_DOUBLE__ 16
-// AARCH64-BE:#define __SIZEOF_LONG_LONG__ 8
-// AARCH64-BE:#define __SIZEOF_LONG__ 8
-// AARCH64-BE:#define __SIZEOF_POINTER__ 8
-// AARCH64-BE:#define __SIZEOF_PTRDIFF_T__ 8
-// AARCH64-BE:#define __SIZEOF_SHORT__ 2
-// AARCH64-BE:#define __SIZEOF_SIZE_T__ 8
-// AARCH64-BE:#define __SIZEOF_WCHAR_T__ 4
-// AARCH64-BE:#define __SIZEOF_WINT_T__ 4
-// AARCH64-BE:#define __SIZE_MAX__ 18446744073709551615UL
-// AARCH64-BE:#define __SIZE_TYPE__ long unsigned int
-// AARCH64-BE:#define __SIZE_WIDTH__ 64
-// AARCH64-BE:#define __UINT16_C_SUFFIX__
-// AARCH64-BE:#define __UINT16_MAX__ 65535
-// AARCH64-BE:#define __UINT16_TYPE__ unsigned short
-// AARCH64-BE:#define __UINT32_C_SUFFIX__ U
-// AARCH64-BE:#define __UINT32_MAX__ 4294967295U
-// AARCH64-BE:#define __UINT32_TYPE__ unsigned int
-// AARCH64-BE:#define __UINT64_C_SUFFIX__ UL
-// AARCH64-BE:#define __UINT64_MAX__ 18446744073709551615UL
-// AARCH64-BE:#define __UINT64_TYPE__ long unsigned int
-// AARCH64-BE:#define __UINT8_C_SUFFIX__
-// AARCH64-BE:#define __UINT8_MAX__ 255
-// AARCH64-BE:#define __UINT8_TYPE__ unsigned char
-// AARCH64-BE:#define __UINTMAX_C_SUFFIX__ UL
-// AARCH64-BE:#define __UINTMAX_MAX__ 18446744073709551615UL
-// AARCH64-BE:#define __UINTMAX_TYPE__ long unsigned int
-// AARCH64-BE:#define __UINTMAX_WIDTH__ 64
-// AARCH64-BE:#define __UINTPTR_MAX__ 18446744073709551615UL
-// AARCH64-BE:#define __UINTPTR_TYPE__ long unsigned int
-// AARCH64-BE:#define __UINTPTR_WIDTH__ 64
-// AARCH64-BE:#define __UINT_FAST16_MAX__ 65535
-// AARCH64-BE:#define __UINT_FAST16_TYPE__ unsigned short
-// AARCH64-BE:#define __UINT_FAST32_MAX__ 4294967295U
-// AARCH64-BE:#define __UINT_FAST32_TYPE__ unsigned int
-// AARCH64-BE:#define __UINT_FAST64_MAX__ 18446744073709551615UL
-// AARCH64-BE:#define __UINT_FAST64_TYPE__ long unsigned int
-// AARCH64-BE:#define __UINT_FAST8_MAX__ 255
-// AARCH64-BE:#define __UINT_FAST8_TYPE__ unsigned char
-// AARCH64-BE:#define __UINT_LEAST16_MAX__ 65535
-// AARCH64-BE:#define __UINT_LEAST16_TYPE__ unsigned short
-// AARCH64-BE:#define __UINT_LEAST32_MAX__ 4294967295U
-// AARCH64-BE:#define __UINT_LEAST32_TYPE__ unsigned int
-// AARCH64-BE:#define __UINT_LEAST64_MAX__ 18446744073709551615UL
-// AARCH64-BE:#define __UINT_LEAST64_TYPE__ long unsigned int
-// AARCH64-BE:#define __UINT_LEAST8_MAX__ 255
-// AARCH64-BE:#define __UINT_LEAST8_TYPE__ unsigned char
-// AARCH64-BE:#define __USER_LABEL_PREFIX__
-// AARCH64-BE:#define __WCHAR_MAX__ 4294967295U
-// AARCH64-BE:#define __WCHAR_TYPE__ unsigned int
-// AARCH64-BE:#define __WCHAR_UNSIGNED__ 1
-// AARCH64-BE:#define __WCHAR_WIDTH__ 32
-// AARCH64-BE:#define __WINT_TYPE__ int
-// AARCH64-BE:#define __WINT_WIDTH__ 32
-// AARCH64-BE:#define __aarch64__ 1
-//
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-netbsd < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-NETBSD %s
-//
-// AARCH64-NETBSD:#define _LP64 1
-// AARCH64-NETBSD-NOT:#define __AARCH64EB__ 1
-// AARCH64-NETBSD:#define __AARCH64EL__ 1
-// AARCH64-NETBSD-NOT:#define __AARCH_BIG_ENDIAN 1
-// AARCH64-NETBSD:#define __ARM_64BIT_STATE 1
-// AARCH64-NETBSD:#define __ARM_ARCH 8
-// AARCH64-NETBSD:#define __ARM_ARCH_ISA_A64 1
-// AARCH64-NETBSD-NOT:#define __ARM_BIG_ENDIAN 1
-// AARCH64-NETBSD:#define __BIGGEST_ALIGNMENT__ 16
-// AARCH64-NETBSD:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
-// AARCH64-NETBSD:#define __CHAR16_TYPE__ unsigned short
-// AARCH64-NETBSD:#define __CHAR32_TYPE__ unsigned int
-// AARCH64-NETBSD:#define __CHAR_BIT__ 8
-// AARCH64-NETBSD:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324
-// AARCH64-NETBSD:#define __DBL_DIG__ 15
-// AARCH64-NETBSD:#define __DBL_EPSILON__ 2.2204460492503131e-16
-// AARCH64-NETBSD:#define __DBL_HAS_DENORM__ 1
-// AARCH64-NETBSD:#define __DBL_HAS_INFINITY__ 1
-// AARCH64-NETBSD:#define __DBL_HAS_QUIET_NAN__ 1
-// AARCH64-NETBSD:#define __DBL_MANT_DIG__ 53
-// AARCH64-NETBSD:#define __DBL_MAX_10_EXP__ 308
-// AARCH64-NETBSD:#define __DBL_MAX_EXP__ 1024
-// AARCH64-NETBSD:#define __DBL_MAX__ 1.7976931348623157e+308
-// AARCH64-NETBSD:#define __DBL_MIN_10_EXP__ (-307)
-// AARCH64-NETBSD:#define __DBL_MIN_EXP__ (-1021)
-// AARCH64-NETBSD:#define __DBL_MIN__ 2.2250738585072014e-308
-// AARCH64-NETBSD:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
-// AARCH64-NETBSD:#define __ELF__ 1
-// AARCH64-NETBSD:#define __FLT_DENORM_MIN__ 1.40129846e-45F
-// AARCH64-NETBSD:#define __FLT_DIG__ 6
-// AARCH64-NETBSD:#define __FLT_EPSILON__ 1.19209290e-7F
-// AARCH64-NETBSD:#define __FLT_EVAL_METHOD__ 0
-// AARCH64-NETBSD:#define __FLT_HAS_DENORM__ 1
-// AARCH64-NETBSD:#define __FLT_HAS_INFINITY__ 1
-// AARCH64-NETBSD:#define __FLT_HAS_QUIET_NAN__ 1
-// AARCH64-NETBSD:#define __FLT_MANT_DIG__ 24
-// AARCH64-NETBSD:#define __FLT_MAX_10_EXP__ 38
-// AARCH64-NETBSD:#define __FLT_MAX_EXP__ 128
-// AARCH64-NETBSD:#define __FLT_MAX__ 3.40282347e+38F
-// AARCH64-NETBSD:#define __FLT_MIN_10_EXP__ (-37)
-// AARCH64-NETBSD:#define __FLT_MIN_EXP__ (-125)
-// AARCH64-NETBSD:#define __FLT_MIN__ 1.17549435e-38F
-// AARCH64-NETBSD:#define __FLT_RADIX__ 2
-// AARCH64-NETBSD:#define __INT16_C_SUFFIX__
-// AARCH64-NETBSD:#define __INT16_FMTd__ "hd"
-// AARCH64-NETBSD:#define __INT16_FMTi__ "hi"
-// AARCH64-NETBSD:#define __INT16_MAX__ 32767
-// AARCH64-NETBSD:#define __INT16_TYPE__ short
-// AARCH64-NETBSD:#define __INT32_C_SUFFIX__
-// AARCH64-NETBSD:#define __INT32_FMTd__ "d"
-// AARCH64-NETBSD:#define __INT32_FMTi__ "i"
-// AARCH64-NETBSD:#define __INT32_MAX__ 2147483647
-// AARCH64-NETBSD:#define __INT32_TYPE__ int
-// AARCH64-NETBSD:#define __INT64_C_SUFFIX__ L
-// AARCH64-NETBSD:#define __INT64_FMTd__ "ld"
-// AARCH64-NETBSD:#define __INT64_FMTi__ "li"
-// AARCH64-NETBSD:#define __INT64_MAX__ 9223372036854775807L
-// AARCH64-NETBSD:#define __INT64_TYPE__ long int
-// AARCH64-NETBSD:#define __INT8_C_SUFFIX__
-// AARCH64-NETBSD:#define __INT8_FMTd__ "hhd"
-// AARCH64-NETBSD:#define __INT8_FMTi__ "hhi"
-// AARCH64-NETBSD:#define __INT8_MAX__ 127
-// AARCH64-NETBSD:#define __INT8_TYPE__ signed char
-// AARCH64-NETBSD:#define __INTMAX_C_SUFFIX__ L
-// AARCH64-NETBSD:#define __INTMAX_FMTd__ "ld"
-// AARCH64-NETBSD:#define __INTMAX_FMTi__ "li"
-// AARCH64-NETBSD:#define __INTMAX_MAX__ 9223372036854775807L
-// AARCH64-NETBSD:#define __INTMAX_TYPE__ long int
-// AARCH64-NETBSD:#define __INTMAX_WIDTH__ 64
-// AARCH64-NETBSD:#define __INTPTR_FMTd__ "ld"
-// AARCH64-NETBSD:#define __INTPTR_FMTi__ "li"
-// AARCH64-NETBSD:#define __INTPTR_MAX__ 9223372036854775807L
-// AARCH64-NETBSD:#define __INTPTR_TYPE__ long int
-// AARCH64-NETBSD:#define __INTPTR_WIDTH__ 64
-// AARCH64-NETBSD:#define __INT_FAST16_FMTd__ "hd"
-// AARCH64-NETBSD:#define __INT_FAST16_FMTi__ "hi"
-// AARCH64-NETBSD:#define __INT_FAST16_MAX__ 32767
-// AARCH64-NETBSD:#define __INT_FAST16_TYPE__ short
-// AARCH64-NETBSD:#define __INT_FAST32_FMTd__ "d"
-// AARCH64-NETBSD:#define __INT_FAST32_FMTi__ "i"
-// AARCH64-NETBSD:#define __INT_FAST32_MAX__ 2147483647
-// AARCH64-NETBSD:#define __INT_FAST32_TYPE__ int
-// AARCH64-NETBSD:#define __INT_FAST64_FMTd__ "ld"
-// AARCH64-NETBSD:#define __INT_FAST64_FMTi__ "li"
-// AARCH64-NETBSD:#define __INT_FAST64_MAX__ 9223372036854775807L
-// AARCH64-NETBSD:#define __INT_FAST64_TYPE__ long int
-// AARCH64-NETBSD:#define __INT_FAST8_FMTd__ "hhd"
-// AARCH64-NETBSD:#define __INT_FAST8_FMTi__ "hhi"
-// AARCH64-NETBSD:#define __INT_FAST8_MAX__ 127
-// AARCH64-NETBSD:#define __INT_FAST8_TYPE__ signed char
-// AARCH64-NETBSD:#define __INT_LEAST16_FMTd__ "hd"
-// AARCH64-NETBSD:#define __INT_LEAST16_FMTi__ "hi"
-// AARCH64-NETBSD:#define __INT_LEAST16_MAX__ 32767
-// AARCH64-NETBSD:#define __INT_LEAST16_TYPE__ short
-// AARCH64-NETBSD:#define __INT_LEAST32_FMTd__ "d"
-// AARCH64-NETBSD:#define __INT_LEAST32_FMTi__ "i"
-// AARCH64-NETBSD:#define __INT_LEAST32_MAX__ 2147483647
-// AARCH64-NETBSD:#define __INT_LEAST32_TYPE__ int
-// AARCH64-NETBSD:#define __INT_LEAST64_FMTd__ "ld"
-// AARCH64-NETBSD:#define __INT_LEAST64_FMTi__ "li"
-// AARCH64-NETBSD:#define __INT_LEAST64_MAX__ 9223372036854775807L
-// AARCH64-NETBSD:#define __INT_LEAST64_TYPE__ long int
-// AARCH64-NETBSD:#define __INT_LEAST8_FMTd__ "hhd"
-// AARCH64-NETBSD:#define __INT_LEAST8_FMTi__ "hhi"
-// AARCH64-NETBSD:#define __INT_LEAST8_MAX__ 127
-// AARCH64-NETBSD:#define __INT_LEAST8_TYPE__ signed char
-// AARCH64-NETBSD:#define __INT_MAX__ 2147483647
-// AARCH64-NETBSD:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L
-// AARCH64-NETBSD:#define __LDBL_DIG__ 33
-// AARCH64-NETBSD:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L
-// AARCH64-NETBSD:#define __LDBL_HAS_DENORM__ 1
-// AARCH64-NETBSD:#define __LDBL_HAS_INFINITY__ 1
-// AARCH64-NETBSD:#define __LDBL_HAS_QUIET_NAN__ 1
-// AARCH64-NETBSD:#define __LDBL_MANT_DIG__ 113
-// AARCH64-NETBSD:#define __LDBL_MAX_10_EXP__ 4932
-// AARCH64-NETBSD:#define __LDBL_MAX_EXP__ 16384
-// AARCH64-NETBSD:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L
-// AARCH64-NETBSD:#define __LDBL_MIN_10_EXP__ (-4931)
-// AARCH64-NETBSD:#define __LDBL_MIN_EXP__ (-16381)
-// AARCH64-NETBSD:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L
-// AARCH64-NETBSD:#define __LITTLE_ENDIAN__ 1
-// AARCH64-NETBSD:#define __LONG_LONG_MAX__ 9223372036854775807LL
-// AARCH64-NETBSD:#define __LONG_MAX__ 9223372036854775807L
-// AARCH64-NETBSD:#define __LP64__ 1
-// AARCH64-NETBSD:#define __NetBSD__ 1
-// AARCH64-NETBSD:#define __POINTER_WIDTH__ 64
-// AARCH64-NETBSD:#define __PTRDIFF_TYPE__ long int
-// AARCH64-NETBSD:#define __PTRDIFF_WIDTH__ 64
-// AARCH64-NETBSD:#define __SCHAR_MAX__ 127
-// AARCH64-NETBSD:#define __SHRT_MAX__ 32767
-// AARCH64-NETBSD:#define __SIG_ATOMIC_MAX__ 2147483647
-// AARCH64-NETBSD:#define __SIG_ATOMIC_WIDTH__ 32
-// AARCH64-NETBSD:#define __SIZEOF_DOUBLE__ 8
-// AARCH64-NETBSD:#define __SIZEOF_FLOAT__ 4
-// AARCH64-NETBSD:#define __SIZEOF_INT__ 4
-// AARCH64-NETBSD:#define __SIZEOF_LONG_DOUBLE__ 16
-// AARCH64-NETBSD:#define __SIZEOF_LONG_LONG__ 8
-// AARCH64-NETBSD:#define __SIZEOF_LONG__ 8
-// AARCH64-NETBSD:#define __SIZEOF_POINTER__ 8
-// AARCH64-NETBSD:#define __SIZEOF_PTRDIFF_T__ 8
-// AARCH64-NETBSD:#define __SIZEOF_SHORT__ 2
-// AARCH64-NETBSD:#define __SIZEOF_SIZE_T__ 8
-// AARCH64-NETBSD:#define __SIZEOF_WCHAR_T__ 4
-// AARCH64-NETBSD:#define __SIZEOF_WINT_T__ 4
-// AARCH64-NETBSD:#define __SIZE_MAX__ 18446744073709551615UL
-// AARCH64-NETBSD:#define __SIZE_TYPE__ long unsigned int
-// AARCH64-NETBSD:#define __SIZE_WIDTH__ 64
-// AARCH64-NETBSD:#define __UINT16_C_SUFFIX__
-// AARCH64-NETBSD:#define __UINT16_MAX__ 65535
-// AARCH64-NETBSD:#define __UINT16_TYPE__ unsigned short
-// AARCH64-NETBSD:#define __UINT32_C_SUFFIX__ U
-// AARCH64-NETBSD:#define __UINT32_MAX__ 4294967295U
-// AARCH64-NETBSD:#define __UINT32_TYPE__ unsigned int
-// AARCH64-NETBSD:#define __UINT64_C_SUFFIX__ UL
-// AARCH64-NETBSD:#define __UINT64_MAX__ 18446744073709551615UL
-// AARCH64-NETBSD:#define __UINT64_TYPE__ long unsigned int
-// AARCH64-NETBSD:#define __UINT8_C_SUFFIX__
-// AARCH64-NETBSD:#define __UINT8_MAX__ 255
-// AARCH64-NETBSD:#define __UINT8_TYPE__ unsigned char
-// AARCH64-NETBSD:#define __UINTMAX_C_SUFFIX__ UL
-// AARCH64-NETBSD:#define __UINTMAX_MAX__ 18446744073709551615UL
-// AARCH64-NETBSD:#define __UINTMAX_TYPE__ long unsigned int
-// AARCH64-NETBSD:#define __UINTMAX_WIDTH__ 64
-// AARCH64-NETBSD:#define __UINTPTR_MAX__ 18446744073709551615UL
-// AARCH64-NETBSD:#define __UINTPTR_TYPE__ long unsigned int
-// AARCH64-NETBSD:#define __UINTPTR_WIDTH__ 64
-// AARCH64-NETBSD:#define __UINT_FAST16_MAX__ 65535
-// AARCH64-NETBSD:#define __UINT_FAST16_TYPE__ unsigned short
-// AARCH64-NETBSD:#define __UINT_FAST32_MAX__ 4294967295U
-// AARCH64-NETBSD:#define __UINT_FAST32_TYPE__ unsigned int
-// AARCH64-NETBSD:#define __UINT_FAST64_MAX__ 18446744073709551615UL
-// AARCH64-NETBSD:#define __UINT_FAST64_TYPE__ long unsigned int
-// AARCH64-NETBSD:#define __UINT_FAST8_MAX__ 255
-// AARCH64-NETBSD:#define __UINT_FAST8_TYPE__ unsigned char
-// AARCH64-NETBSD:#define __UINT_LEAST16_MAX__ 65535
-// AARCH64-NETBSD:#define __UINT_LEAST16_TYPE__ unsigned short
-// AARCH64-NETBSD:#define __UINT_LEAST32_MAX__ 4294967295U
-// AARCH64-NETBSD:#define __UINT_LEAST32_TYPE__ unsigned int
-// AARCH64-NETBSD:#define __UINT_LEAST64_MAX__ 18446744073709551615UL
-// AARCH64-NETBSD:#define __UINT_LEAST64_TYPE__ long unsigned int
-// AARCH64-NETBSD:#define __UINT_LEAST8_MAX__ 255
-// AARCH64-NETBSD:#define __UINT_LEAST8_TYPE__ unsigned char
-// AARCH64-NETBSD:#define __USER_LABEL_PREFIX__
-// AARCH64-NETBSD:#define __WCHAR_MAX__ 2147483647
-// AARCH64-NETBSD:#define __WCHAR_TYPE__ int
-// AARCH64-NETBSD:#define __WCHAR_WIDTH__ 32
-// AARCH64-NETBSD:#define __WINT_TYPE__ int
-// AARCH64-NETBSD:#define __WINT_WIDTH__ 32
-// AARCH64-NETBSD:#define __aarch64__ 1
-//
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-OPENBSD %s
-//
-// AARCH64-OPENBSD:#define _LP64 1
-// AARCH64-OPENBSD-NOT:#define __AARCH64EB__ 1
-// AARCH64-OPENBSD:#define __AARCH64EL__ 1
-// AARCH64-OPENBSD-NOT:#define __AARCH_BIG_ENDIAN 1
-// AARCH64-OPENBSD:#define __ARM_64BIT_STATE 1
-// AARCH64-OPENBSD:#define __ARM_ARCH 8
-// AARCH64-OPENBSD:#define __ARM_ARCH_ISA_A64 1
-// AARCH64-OPENBSD-NOT:#define __ARM_BIG_ENDIAN 1
-// AARCH64-OPENBSD:#define __BIGGEST_ALIGNMENT__ 16
-// AARCH64-OPENBSD:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
-// AARCH64-OPENBSD:#define __CHAR16_TYPE__ unsigned short
-// AARCH64-OPENBSD:#define __CHAR32_TYPE__ unsigned int
-// AARCH64-OPENBSD:#define __CHAR_BIT__ 8
-// AARCH64-OPENBSD:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324
-// AARCH64-OPENBSD:#define __DBL_DIG__ 15
-// AARCH64-OPENBSD:#define __DBL_EPSILON__ 2.2204460492503131e-16
-// AARCH64-OPENBSD:#define __DBL_HAS_DENORM__ 1
-// AARCH64-OPENBSD:#define __DBL_HAS_INFINITY__ 1
-// AARCH64-OPENBSD:#define __DBL_HAS_QUIET_NAN__ 1
-// AARCH64-OPENBSD:#define __DBL_MANT_DIG__ 53
-// AARCH64-OPENBSD:#define __DBL_MAX_10_EXP__ 308
-// AARCH64-OPENBSD:#define __DBL_MAX_EXP__ 1024
-// AARCH64-OPENBSD:#define __DBL_MAX__ 1.7976931348623157e+308
-// AARCH64-OPENBSD:#define __DBL_MIN_10_EXP__ (-307)
-// AARCH64-OPENBSD:#define __DBL_MIN_EXP__ (-1021)
-// AARCH64-OPENBSD:#define __DBL_MIN__ 2.2250738585072014e-308
-// AARCH64-OPENBSD:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
-// AARCH64-OPENBSD:#define __ELF__ 1
-// AARCH64-OPENBSD:#define __FLT_DENORM_MIN__ 1.40129846e-45F
-// AARCH64-OPENBSD:#define __FLT_DIG__ 6
-// AARCH64-OPENBSD:#define __FLT_EPSILON__ 1.19209290e-7F
-// AARCH64-OPENBSD:#define __FLT_EVAL_METHOD__ 0
-// AARCH64-OPENBSD:#define __FLT_HAS_DENORM__ 1
-// AARCH64-OPENBSD:#define __FLT_HAS_INFINITY__ 1
-// AARCH64-OPENBSD:#define __FLT_HAS_QUIET_NAN__ 1
-// AARCH64-OPENBSD:#define __FLT_MANT_DIG__ 24
-// AARCH64-OPENBSD:#define __FLT_MAX_10_EXP__ 38
-// AARCH64-OPENBSD:#define __FLT_MAX_EXP__ 128
-// AARCH64-OPENBSD:#define __FLT_MAX__ 3.40282347e+38F
-// AARCH64-OPENBSD:#define __FLT_MIN_10_EXP__ (-37)
-// AARCH64-OPENBSD:#define __FLT_MIN_EXP__ (-125)
-// AARCH64-OPENBSD:#define __FLT_MIN__ 1.17549435e-38F
-// AARCH64-OPENBSD:#define __FLT_RADIX__ 2
-// AARCH64-OPENBSD:#define __INT16_C_SUFFIX__
-// AARCH64-OPENBSD:#define __INT16_FMTd__ "hd"
-// AARCH64-OPENBSD:#define __INT16_FMTi__ "hi"
-// AARCH64-OPENBSD:#define __INT16_MAX__ 32767
-// AARCH64-OPENBSD:#define __INT16_TYPE__ short
-// AARCH64-OPENBSD:#define __INT32_C_SUFFIX__
-// AARCH64-OPENBSD:#define __INT32_FMTd__ "d"
-// AARCH64-OPENBSD:#define __INT32_FMTi__ "i"
-// AARCH64-OPENBSD:#define __INT32_MAX__ 2147483647
-// AARCH64-OPENBSD:#define __INT32_TYPE__ int
-// AARCH64-OPENBSD:#define __INT64_C_SUFFIX__ LL
-// AARCH64-OPENBSD:#define __INT64_FMTd__ "lld"
-// AARCH64-OPENBSD:#define __INT64_FMTi__ "lli"
-// AARCH64-OPENBSD:#define __INT64_MAX__ 9223372036854775807LL
-// AARCH64-OPENBSD:#define __INT64_TYPE__ long long int
-// AARCH64-OPENBSD:#define __INT8_C_SUFFIX__
-// AARCH64-OPENBSD:#define __INT8_FMTd__ "hhd"
-// AARCH64-OPENBSD:#define __INT8_FMTi__ "hhi"
-// AARCH64-OPENBSD:#define __INT8_MAX__ 127
-// AARCH64-OPENBSD:#define __INT8_TYPE__ signed char
-// AARCH64-OPENBSD:#define __INTMAX_C_SUFFIX__ LL
-// AARCH64-OPENBSD:#define __INTMAX_FMTd__ "lld"
-// AARCH64-OPENBSD:#define __INTMAX_FMTi__ "lli"
-// AARCH64-OPENBSD:#define __INTMAX_MAX__ 9223372036854775807LL
-// AARCH64-OPENBSD:#define __INTMAX_TYPE__ long long int
-// AARCH64-OPENBSD:#define __INTMAX_WIDTH__ 64
-// AARCH64-OPENBSD:#define __INTPTR_FMTd__ "ld"
-// AARCH64-OPENBSD:#define __INTPTR_FMTi__ "li"
-// AARCH64-OPENBSD:#define __INTPTR_MAX__ 9223372036854775807L
-// AARCH64-OPENBSD:#define __INTPTR_TYPE__ long int
-// AARCH64-OPENBSD:#define __INTPTR_WIDTH__ 64
-// AARCH64-OPENBSD:#define __INT_FAST16_FMTd__ "hd"
-// AARCH64-OPENBSD:#define __INT_FAST16_FMTi__ "hi"
-// AARCH64-OPENBSD:#define __INT_FAST16_MAX__ 32767
-// AARCH64-OPENBSD:#define __INT_FAST16_TYPE__ short
-// AARCH64-OPENBSD:#define __INT_FAST32_FMTd__ "d"
-// AARCH64-OPENBSD:#define __INT_FAST32_FMTi__ "i"
-// AARCH64-OPENBSD:#define __INT_FAST32_MAX__ 2147483647
-// AARCH64-OPENBSD:#define __INT_FAST32_TYPE__ int
-// AARCH64-OPENBSD:#define __INT_FAST64_FMTd__ "ld"
-// AARCH64-OPENBSD:#define __INT_FAST64_FMTi__ "li"
-// AARCH64-OPENBSD:#define __INT_FAST64_MAX__ 9223372036854775807L
-// AARCH64-OPENBSD:#define __INT_FAST64_TYPE__ long int
-// AARCH64-OPENBSD:#define __INT_FAST8_FMTd__ "hhd"
-// AARCH64-OPENBSD:#define __INT_FAST8_FMTi__ "hhi"
-// AARCH64-OPENBSD:#define __INT_FAST8_MAX__ 127
-// AARCH64-OPENBSD:#define __INT_FAST8_TYPE__ signed char
-// AARCH64-OPENBSD:#define __INT_LEAST16_FMTd__ "hd"
-// AARCH64-OPENBSD:#define __INT_LEAST16_FMTi__ "hi"
-// AARCH64-OPENBSD:#define __INT_LEAST16_MAX__ 32767
-// AARCH64-OPENBSD:#define __INT_LEAST16_TYPE__ short
-// AARCH64-OPENBSD:#define __INT_LEAST32_FMTd__ "d"
-// AARCH64-OPENBSD:#define __INT_LEAST32_FMTi__ "i"
-// AARCH64-OPENBSD:#define __INT_LEAST32_MAX__ 2147483647
-// AARCH64-OPENBSD:#define __INT_LEAST32_TYPE__ int
-// AARCH64-OPENBSD:#define __INT_LEAST64_FMTd__ "ld"
-// AARCH64-OPENBSD:#define __INT_LEAST64_FMTi__ "li"
-// AARCH64-OPENBSD:#define __INT_LEAST64_MAX__ 9223372036854775807L
-// AARCH64-OPENBSD:#define __INT_LEAST64_TYPE__ long int
-// AARCH64-OPENBSD:#define __INT_LEAST8_FMTd__ "hhd"
-// AARCH64-OPENBSD:#define __INT_LEAST8_FMTi__ "hhi"
-// AARCH64-OPENBSD:#define __INT_LEAST8_MAX__ 127
-// AARCH64-OPENBSD:#define __INT_LEAST8_TYPE__ signed char
-// AARCH64-OPENBSD:#define __INT_MAX__ 2147483647
-// AARCH64-OPENBSD:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L
-// AARCH64-OPENBSD:#define __LDBL_DIG__ 33
-// AARCH64-OPENBSD:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L
-// AARCH64-OPENBSD:#define __LDBL_HAS_DENORM__ 1
-// AARCH64-OPENBSD:#define __LDBL_HAS_INFINITY__ 1
-// AARCH64-OPENBSD:#define __LDBL_HAS_QUIET_NAN__ 1
-// AARCH64-OPENBSD:#define __LDBL_MANT_DIG__ 113
-// AARCH64-OPENBSD:#define __LDBL_MAX_10_EXP__ 4932
-// AARCH64-OPENBSD:#define __LDBL_MAX_EXP__ 16384
-// AARCH64-OPENBSD:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L
-// AARCH64-OPENBSD:#define __LDBL_MIN_10_EXP__ (-4931)
-// AARCH64-OPENBSD:#define __LDBL_MIN_EXP__ (-16381)
-// AARCH64-OPENBSD:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L
-// AARCH64-OPENBSD:#define __LITTLE_ENDIAN__ 1
-// AARCH64-OPENBSD:#define __LONG_LONG_MAX__ 9223372036854775807LL
-// AARCH64-OPENBSD:#define __LONG_MAX__ 9223372036854775807L
-// AARCH64-OPENBSD:#define __LP64__ 1
-// AARCH64-OPENBSD:#define __OpenBSD__ 1
-// AARCH64-OPENBSD:#define __POINTER_WIDTH__ 64
-// AARCH64-OPENBSD:#define __PTRDIFF_TYPE__ long int
-// AARCH64-OPENBSD:#define __PTRDIFF_WIDTH__ 64
-// AARCH64-OPENBSD:#define __SCHAR_MAX__ 127
-// AARCH64-OPENBSD:#define __SHRT_MAX__ 32767
-// AARCH64-OPENBSD:#define __SIG_ATOMIC_MAX__ 2147483647
-// AARCH64-OPENBSD:#define __SIG_ATOMIC_WIDTH__ 32
-// AARCH64-OPENBSD:#define __SIZEOF_DOUBLE__ 8
-// AARCH64-OPENBSD:#define __SIZEOF_FLOAT__ 4
-// AARCH64-OPENBSD:#define __SIZEOF_INT__ 4
-// AARCH64-OPENBSD:#define __SIZEOF_LONG_DOUBLE__ 16
-// AARCH64-OPENBSD:#define __SIZEOF_LONG_LONG__ 8
-// AARCH64-OPENBSD:#define __SIZEOF_LONG__ 8
-// AARCH64-OPENBSD:#define __SIZEOF_POINTER__ 8
-// AARCH64-OPENBSD:#define __SIZEOF_PTRDIFF_T__ 8
-// AARCH64-OPENBSD:#define __SIZEOF_SHORT__ 2
-// AARCH64-OPENBSD:#define __SIZEOF_SIZE_T__ 8
-// AARCH64-OPENBSD:#define __SIZEOF_WCHAR_T__ 4
-// AARCH64-OPENBSD:#define __SIZEOF_WINT_T__ 4
-// AARCH64-OPENBSD:#define __SIZE_MAX__ 18446744073709551615UL
-// AARCH64-OPENBSD:#define __SIZE_TYPE__ long unsigned int
-// AARCH64-OPENBSD:#define __SIZE_WIDTH__ 64
-// AARCH64-OPENBSD:#define __UINT16_C_SUFFIX__
-// AARCH64-OPENBSD:#define __UINT16_MAX__ 65535
-// AARCH64-OPENBSD:#define __UINT16_TYPE__ unsigned short
-// AARCH64-OPENBSD:#define __UINT32_C_SUFFIX__ U
-// AARCH64-OPENBSD:#define __UINT32_MAX__ 4294967295U
-// AARCH64-OPENBSD:#define __UINT32_TYPE__ unsigned int
-// AARCH64-OPENBSD:#define __UINT64_C_SUFFIX__ ULL
-// AARCH64-OPENBSD:#define __UINT64_MAX__ 18446744073709551615ULL
-// AARCH64-OPENBSD:#define __UINT64_TYPE__ long long unsigned int
-// AARCH64-OPENBSD:#define __UINT8_C_SUFFIX__
-// AARCH64-OPENBSD:#define __UINT8_MAX__ 255
-// AARCH64-OPENBSD:#define __UINT8_TYPE__ unsigned char
-// AARCH64-OPENBSD:#define __UINTMAX_C_SUFFIX__ ULL
-// AARCH64-OPENBSD:#define __UINTMAX_MAX__ 18446744073709551615ULL
-// AARCH64-OPENBSD:#define __UINTMAX_TYPE__ long long unsigned int
-// AARCH64-OPENBSD:#define __UINTMAX_WIDTH__ 64
-// AARCH64-OPENBSD:#define __UINTPTR_MAX__ 18446744073709551615UL
-// AARCH64-OPENBSD:#define __UINTPTR_TYPE__ long unsigned int
-// AARCH64-OPENBSD:#define __UINTPTR_WIDTH__ 64
-// AARCH64-OPENBSD:#define __UINT_FAST16_MAX__ 65535
-// AARCH64-OPENBSD:#define __UINT_FAST16_TYPE__ unsigned short
-// AARCH64-OPENBSD:#define __UINT_FAST32_MAX__ 4294967295U
-// AARCH64-OPENBSD:#define __UINT_FAST32_TYPE__ unsigned int
-// AARCH64-OPENBSD:#define __UINT_FAST64_MAX__ 18446744073709551615UL
-// AARCH64-OPENBSD:#define __UINT_FAST64_TYPE__ long unsigned int
-// AARCH64-OPENBSD:#define __UINT_FAST8_MAX__ 255
-// AARCH64-OPENBSD:#define __UINT_FAST8_TYPE__ unsigned char
-// AARCH64-OPENBSD:#define __UINT_LEAST16_MAX__ 65535
-// AARCH64-OPENBSD:#define __UINT_LEAST16_TYPE__ unsigned short
-// AARCH64-OPENBSD:#define __UINT_LEAST32_MAX__ 4294967295U
-// AARCH64-OPENBSD:#define __UINT_LEAST32_TYPE__ unsigned int
-// AARCH64-OPENBSD:#define __UINT_LEAST64_MAX__ 18446744073709551615UL
-// AARCH64-OPENBSD:#define __UINT_LEAST64_TYPE__ long unsigned int
-// AARCH64-OPENBSD:#define __UINT_LEAST8_MAX__ 255
-// AARCH64-OPENBSD:#define __UINT_LEAST8_TYPE__ unsigned char
-// AARCH64-OPENBSD:#define __USER_LABEL_PREFIX__
-// AARCH64-OPENBSD:#define __WCHAR_MAX__ 2147483647
-// AARCH64-OPENBSD:#define __WCHAR_TYPE__ int
-// AARCH64-OPENBSD:#define __WCHAR_WIDTH__ 32
-// AARCH64-OPENBSD:#define __WINT_TYPE__ int
-// AARCH64-OPENBSD:#define __WINT_WIDTH__ 32
-// AARCH64-OPENBSD:#define __aarch64__ 1
-//
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-freebsd11 < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-FREEBSD %s
-//
-// AARCH64-FREEBSD:#define _LP64 1
-// AARCH64-FREEBSD-NOT:#define __AARCH64EB__ 1
-// AARCH64-FREEBSD:#define __AARCH64EL__ 1
-// AARCH64-FREEBSD-NOT:#define __AARCH_BIG_ENDIAN 1
-// AARCH64-FREEBSD:#define __ARM_64BIT_STATE 1
-// AARCH64-FREEBSD:#define __ARM_ARCH 8
-// AARCH64-FREEBSD:#define __ARM_ARCH_ISA_A64 1
-// AARCH64-FREEBSD-NOT:#define __ARM_BIG_ENDIAN 1
-// AARCH64-FREEBSD:#define __BIGGEST_ALIGNMENT__ 16
-// AARCH64-FREEBSD:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
-// AARCH64-FREEBSD:#define __CHAR16_TYPE__ unsigned short
-// AARCH64-FREEBSD:#define __CHAR32_TYPE__ unsigned int
-// AARCH64-FREEBSD:#define __CHAR_BIT__ 8
-// AARCH64-FREEBSD:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324
-// AARCH64-FREEBSD:#define __DBL_DIG__ 15
-// AARCH64-FREEBSD:#define __DBL_EPSILON__ 2.2204460492503131e-16
-// AARCH64-FREEBSD:#define __DBL_HAS_DENORM__ 1
-// AARCH64-FREEBSD:#define __DBL_HAS_INFINITY__ 1
-// AARCH64-FREEBSD:#define __DBL_HAS_QUIET_NAN__ 1
-// AARCH64-FREEBSD:#define __DBL_MANT_DIG__ 53
-// AARCH64-FREEBSD:#define __DBL_MAX_10_EXP__ 308
-// AARCH64-FREEBSD:#define __DBL_MAX_EXP__ 1024
-// AARCH64-FREEBSD:#define __DBL_MAX__ 1.7976931348623157e+308
-// AARCH64-FREEBSD:#define __DBL_MIN_10_EXP__ (-307)
-// AARCH64-FREEBSD:#define __DBL_MIN_EXP__ (-1021)
-// AARCH64-FREEBSD:#define __DBL_MIN__ 2.2250738585072014e-308
-// AARCH64-FREEBSD:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
-// AARCH64-FREEBSD:#define __ELF__ 1
-// AARCH64-FREEBSD:#define __FLT_DENORM_MIN__ 1.40129846e-45F
-// AARCH64-FREEBSD:#define __FLT_DIG__ 6
-// AARCH64-FREEBSD:#define __FLT_EPSILON__ 1.19209290e-7F
-// AARCH64-FREEBSD:#define __FLT_EVAL_METHOD__ 0
-// AARCH64-FREEBSD:#define __FLT_HAS_DENORM__ 1
-// AARCH64-FREEBSD:#define __FLT_HAS_INFINITY__ 1
-// AARCH64-FREEBSD:#define __FLT_HAS_QUIET_NAN__ 1
-// AARCH64-FREEBSD:#define __FLT_MANT_DIG__ 24
-// AARCH64-FREEBSD:#define __FLT_MAX_10_EXP__ 38
-// AARCH64-FREEBSD:#define __FLT_MAX_EXP__ 128
-// AARCH64-FREEBSD:#define __FLT_MAX__ 3.40282347e+38F
-// AARCH64-FREEBSD:#define __FLT_MIN_10_EXP__ (-37)
-// AARCH64-FREEBSD:#define __FLT_MIN_EXP__ (-125)
-// AARCH64-FREEBSD:#define __FLT_MIN__ 1.17549435e-38F
-// AARCH64-FREEBSD:#define __FLT_RADIX__ 2
-// AARCH64-FREEBSD:#define __FreeBSD__ 11
-// AARCH64-FREEBSD:#define __INT16_C_SUFFIX__
-// AARCH64-FREEBSD:#define __INT16_FMTd__ "hd"
-// AARCH64-FREEBSD:#define __INT16_FMTi__ "hi"
-// AARCH64-FREEBSD:#define __INT16_MAX__ 32767
-// AARCH64-FREEBSD:#define __INT16_TYPE__ short
-// AARCH64-FREEBSD:#define __INT32_C_SUFFIX__
-// AARCH64-FREEBSD:#define __INT32_FMTd__ "d"
-// AARCH64-FREEBSD:#define __INT32_FMTi__ "i"
-// AARCH64-FREEBSD:#define __INT32_MAX__ 2147483647
-// AARCH64-FREEBSD:#define __INT32_TYPE__ int
-// AARCH64-FREEBSD:#define __INT64_C_SUFFIX__ L
-// AARCH64-FREEBSD:#define __INT64_FMTd__ "ld"
-// AARCH64-FREEBSD:#define __INT64_FMTi__ "li"
-// AARCH64-FREEBSD:#define __INT64_MAX__ 9223372036854775807L
-// AARCH64-FREEBSD:#define __INT64_TYPE__ long int
-// AARCH64-FREEBSD:#define __INT8_C_SUFFIX__
-// AARCH64-FREEBSD:#define __INT8_FMTd__ "hhd"
-// AARCH64-FREEBSD:#define __INT8_FMTi__ "hhi"
-// AARCH64-FREEBSD:#define __INT8_MAX__ 127
-// AARCH64-FREEBSD:#define __INT8_TYPE__ signed char
-// AARCH64-FREEBSD:#define __INTMAX_C_SUFFIX__ L
-// AARCH64-FREEBSD:#define __INTMAX_FMTd__ "ld"
-// AARCH64-FREEBSD:#define __INTMAX_FMTi__ "li"
-// AARCH64-FREEBSD:#define __INTMAX_MAX__ 9223372036854775807L
-// AARCH64-FREEBSD:#define __INTMAX_TYPE__ long int
-// AARCH64-FREEBSD:#define __INTMAX_WIDTH__ 64
-// AARCH64-FREEBSD:#define __INTPTR_FMTd__ "ld"
-// AARCH64-FREEBSD:#define __INTPTR_FMTi__ "li"
-// AARCH64-FREEBSD:#define __INTPTR_MAX__ 9223372036854775807L
-// AARCH64-FREEBSD:#define __INTPTR_TYPE__ long int
-// AARCH64-FREEBSD:#define __INTPTR_WIDTH__ 64
-// AARCH64-FREEBSD:#define __INT_FAST16_FMTd__ "hd"
-// AARCH64-FREEBSD:#define __INT_FAST16_FMTi__ "hi"
-// AARCH64-FREEBSD:#define __INT_FAST16_MAX__ 32767
-// AARCH64-FREEBSD:#define __INT_FAST16_TYPE__ short
-// AARCH64-FREEBSD:#define __INT_FAST32_FMTd__ "d"
-// AARCH64-FREEBSD:#define __INT_FAST32_FMTi__ "i"
-// AARCH64-FREEBSD:#define __INT_FAST32_MAX__ 2147483647
-// AARCH64-FREEBSD:#define __INT_FAST32_TYPE__ int
-// AARCH64-FREEBSD:#define __INT_FAST64_FMTd__ "ld"
-// AARCH64-FREEBSD:#define __INT_FAST64_FMTi__ "li"
-// AARCH64-FREEBSD:#define __INT_FAST64_MAX__ 9223372036854775807L
-// AARCH64-FREEBSD:#define __INT_FAST64_TYPE__ long int
-// AARCH64-FREEBSD:#define __INT_FAST8_FMTd__ "hhd"
-// AARCH64-FREEBSD:#define __INT_FAST8_FMTi__ "hhi"
-// AARCH64-FREEBSD:#define __INT_FAST8_MAX__ 127
-// AARCH64-FREEBSD:#define __INT_FAST8_TYPE__ signed char
-// AARCH64-FREEBSD:#define __INT_LEAST16_FMTd__ "hd"
-// AARCH64-FREEBSD:#define __INT_LEAST16_FMTi__ "hi"
-// AARCH64-FREEBSD:#define __INT_LEAST16_MAX__ 32767
-// AARCH64-FREEBSD:#define __INT_LEAST16_TYPE__ short
-// AARCH64-FREEBSD:#define __INT_LEAST32_FMTd__ "d"
-// AARCH64-FREEBSD:#define __INT_LEAST32_FMTi__ "i"
-// AARCH64-FREEBSD:#define __INT_LEAST32_MAX__ 2147483647
-// AARCH64-FREEBSD:#define __INT_LEAST32_TYPE__ int
-// AARCH64-FREEBSD:#define __INT_LEAST64_FMTd__ "ld"
-// AARCH64-FREEBSD:#define __INT_LEAST64_FMTi__ "li"
-// AARCH64-FREEBSD:#define __INT_LEAST64_MAX__ 9223372036854775807L
-// AARCH64-FREEBSD:#define __INT_LEAST64_TYPE__ long int
-// AARCH64-FREEBSD:#define __INT_LEAST8_FMTd__ "hhd"
-// AARCH64-FREEBSD:#define __INT_LEAST8_FMTi__ "hhi"
-// AARCH64-FREEBSD:#define __INT_LEAST8_MAX__ 127
-// AARCH64-FREEBSD:#define __INT_LEAST8_TYPE__ signed char
-// AARCH64-FREEBSD:#define __INT_MAX__ 2147483647
-// AARCH64-FREEBSD:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L
-// AARCH64-FREEBSD:#define __LDBL_DIG__ 33
-// AARCH64-FREEBSD:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L
-// AARCH64-FREEBSD:#define __LDBL_HAS_DENORM__ 1
-// AARCH64-FREEBSD:#define __LDBL_HAS_INFINITY__ 1
-// AARCH64-FREEBSD:#define __LDBL_HAS_QUIET_NAN__ 1
-// AARCH64-FREEBSD:#define __LDBL_MANT_DIG__ 113
-// AARCH64-FREEBSD:#define __LDBL_MAX_10_EXP__ 4932
-// AARCH64-FREEBSD:#define __LDBL_MAX_EXP__ 16384
-// AARCH64-FREEBSD:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L
-// AARCH64-FREEBSD:#define __LDBL_MIN_10_EXP__ (-4931)
-// AARCH64-FREEBSD:#define __LDBL_MIN_EXP__ (-16381)
-// AARCH64-FREEBSD:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L
-// AARCH64-FREEBSD:#define __LITTLE_ENDIAN__ 1
-// AARCH64-FREEBSD:#define __LONG_LONG_MAX__ 9223372036854775807LL
-// AARCH64-FREEBSD:#define __LONG_MAX__ 9223372036854775807L
-// AARCH64-FREEBSD:#define __LP64__ 1
-// AARCH64-FREEBSD:#define __POINTER_WIDTH__ 64
-// AARCH64-FREEBSD:#define __PTRDIFF_TYPE__ long int
-// AARCH64-FREEBSD:#define __PTRDIFF_WIDTH__ 64
-// AARCH64-FREEBSD:#define __SCHAR_MAX__ 127
-// AARCH64-FREEBSD:#define __SHRT_MAX__ 32767
-// AARCH64-FREEBSD:#define __SIG_ATOMIC_MAX__ 2147483647
-// AARCH64-FREEBSD:#define __SIG_ATOMIC_WIDTH__ 32
-// AARCH64-FREEBSD:#define __SIZEOF_DOUBLE__ 8
-// AARCH64-FREEBSD:#define __SIZEOF_FLOAT__ 4
-// AARCH64-FREEBSD:#define __SIZEOF_INT128__ 16
-// AARCH64-FREEBSD:#define __SIZEOF_INT__ 4
-// AARCH64-FREEBSD:#define __SIZEOF_LONG_DOUBLE__ 16
-// AARCH64-FREEBSD:#define __SIZEOF_LONG_LONG__ 8
-// AARCH64-FREEBSD:#define __SIZEOF_LONG__ 8
-// AARCH64-FREEBSD:#define __SIZEOF_POINTER__ 8
-// AARCH64-FREEBSD:#define __SIZEOF_PTRDIFF_T__ 8
-// AARCH64-FREEBSD:#define __SIZEOF_SHORT__ 2
-// AARCH64-FREEBSD:#define __SIZEOF_SIZE_T__ 8
-// AARCH64-FREEBSD:#define __SIZEOF_WCHAR_T__ 4
-// AARCH64-FREEBSD:#define __SIZEOF_WINT_T__ 4
-// AARCH64-FREEBSD:#define __SIZE_MAX__ 18446744073709551615UL
-// AARCH64-FREEBSD:#define __SIZE_TYPE__ long unsigned int
-// AARCH64-FREEBSD:#define 
__SIZE_WIDTH__ 64 -// AARCH64-FREEBSD:#define __UINT16_C_SUFFIX__ -// AARCH64-FREEBSD:#define __UINT16_MAX__ 65535 -// AARCH64-FREEBSD:#define __UINT16_TYPE__ unsigned short -// AARCH64-FREEBSD:#define __UINT32_C_SUFFIX__ U -// AARCH64-FREEBSD:#define __UINT32_MAX__ 4294967295U -// AARCH64-FREEBSD:#define __UINT32_TYPE__ unsigned int -// AARCH64-FREEBSD:#define __UINT64_C_SUFFIX__ UL -// AARCH64-FREEBSD:#define __UINT64_MAX__ 18446744073709551615UL -// AARCH64-FREEBSD:#define __UINT64_TYPE__ long unsigned int -// AARCH64-FREEBSD:#define __UINT8_C_SUFFIX__ -// AARCH64-FREEBSD:#define __UINT8_MAX__ 255 -// AARCH64-FREEBSD:#define __UINT8_TYPE__ unsigned char -// AARCH64-FREEBSD:#define __UINTMAX_C_SUFFIX__ UL -// AARCH64-FREEBSD:#define __UINTMAX_MAX__ 18446744073709551615UL -// AARCH64-FREEBSD:#define __UINTMAX_TYPE__ long unsigned int -// AARCH64-FREEBSD:#define __UINTMAX_WIDTH__ 64 -// AARCH64-FREEBSD:#define __UINTPTR_MAX__ 18446744073709551615UL -// AARCH64-FREEBSD:#define __UINTPTR_TYPE__ long unsigned int -// AARCH64-FREEBSD:#define __UINTPTR_WIDTH__ 64 -// AARCH64-FREEBSD:#define __UINT_FAST16_MAX__ 65535 -// AARCH64-FREEBSD:#define __UINT_FAST16_TYPE__ unsigned short -// AARCH64-FREEBSD:#define __UINT_FAST32_MAX__ 4294967295U -// AARCH64-FREEBSD:#define __UINT_FAST32_TYPE__ unsigned int -// AARCH64-FREEBSD:#define __UINT_FAST64_MAX__ 18446744073709551615UL -// AARCH64-FREEBSD:#define __UINT_FAST64_TYPE__ long unsigned int -// AARCH64-FREEBSD:#define __UINT_FAST8_MAX__ 255 -// AARCH64-FREEBSD:#define __UINT_FAST8_TYPE__ unsigned char -// AARCH64-FREEBSD:#define __UINT_LEAST16_MAX__ 65535 -// AARCH64-FREEBSD:#define __UINT_LEAST16_TYPE__ unsigned short -// AARCH64-FREEBSD:#define __UINT_LEAST32_MAX__ 4294967295U -// AARCH64-FREEBSD:#define __UINT_LEAST32_TYPE__ unsigned int -// AARCH64-FREEBSD:#define __UINT_LEAST64_MAX__ 18446744073709551615UL -// AARCH64-FREEBSD:#define __UINT_LEAST64_TYPE__ long unsigned int -// AARCH64-FREEBSD:#define __UINT_LEAST8_MAX__ 255 -// AARCH64-FREEBSD:#define __UINT_LEAST8_TYPE__ unsigned char -// AARCH64-FREEBSD:#define __USER_LABEL_PREFIX__ -// AARCH64-FREEBSD:#define __WCHAR_MAX__ 4294967295U -// AARCH64-FREEBSD:#define __WCHAR_TYPE__ unsigned int -// AARCH64-FREEBSD:#define __WCHAR_UNSIGNED__ 1 -// AARCH64-FREEBSD:#define __WCHAR_WIDTH__ 32 -// AARCH64-FREEBSD:#define __WINT_MAX__ 2147483647 -// AARCH64-FREEBSD:#define __WINT_TYPE__ int -// AARCH64-FREEBSD:#define __WINT_WIDTH__ 32 -// AARCH64-FREEBSD:#define __aarch64__ 1 - -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-apple-ios7.0 < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-DARWIN %s -// -// AARCH64-DARWIN: #define _LP64 1 -// AARCH64-DARWIN-NOT: #define __AARCH64EB__ 1 -// AARCH64-DARWIN: #define __AARCH64EL__ 1 -// AARCH64-DARWIN-NOT: #define __AARCH_BIG_ENDIAN 1 -// AARCH64-DARWIN: #define __ARM_64BIT_STATE 1 -// AARCH64-DARWIN: #define __ARM_ARCH 8 -// AARCH64-DARWIN: #define __ARM_ARCH_ISA_A64 1 -// AARCH64-DARWIN-NOT: #define __ARM_BIG_ENDIAN 1 -// AARCH64-DARWIN: #define __BIGGEST_ALIGNMENT__ 8 -// AARCH64-DARWIN: #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ -// AARCH64-DARWIN: #define __CHAR16_TYPE__ unsigned short -// AARCH64-DARWIN: #define __CHAR32_TYPE__ unsigned int -// AARCH64-DARWIN: #define __CHAR_BIT__ 8 -// AARCH64-DARWIN: #define __DBL_DENORM_MIN__ 4.9406564584124654e-324 -// AARCH64-DARWIN: #define __DBL_DIG__ 15 -// AARCH64-DARWIN: #define __DBL_EPSILON__ 2.2204460492503131e-16 -// AARCH64-DARWIN: #define __DBL_HAS_DENORM__ 1 -// 
AARCH64-DARWIN: #define __DBL_HAS_INFINITY__ 1 -// AARCH64-DARWIN: #define __DBL_HAS_QUIET_NAN__ 1 -// AARCH64-DARWIN: #define __DBL_MANT_DIG__ 53 -// AARCH64-DARWIN: #define __DBL_MAX_10_EXP__ 308 -// AARCH64-DARWIN: #define __DBL_MAX_EXP__ 1024 -// AARCH64-DARWIN: #define __DBL_MAX__ 1.7976931348623157e+308 -// AARCH64-DARWIN: #define __DBL_MIN_10_EXP__ (-307) -// AARCH64-DARWIN: #define __DBL_MIN_EXP__ (-1021) -// AARCH64-DARWIN: #define __DBL_MIN__ 2.2250738585072014e-308 -// AARCH64-DARWIN: #define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__ -// AARCH64-DARWIN: #define __FLT_DENORM_MIN__ 1.40129846e-45F -// AARCH64-DARWIN: #define __FLT_DIG__ 6 -// AARCH64-DARWIN: #define __FLT_EPSILON__ 1.19209290e-7F -// AARCH64-DARWIN: #define __FLT_EVAL_METHOD__ 0 -// AARCH64-DARWIN: #define __FLT_HAS_DENORM__ 1 -// AARCH64-DARWIN: #define __FLT_HAS_INFINITY__ 1 -// AARCH64-DARWIN: #define __FLT_HAS_QUIET_NAN__ 1 -// AARCH64-DARWIN: #define __FLT_MANT_DIG__ 24 -// AARCH64-DARWIN: #define __FLT_MAX_10_EXP__ 38 -// AARCH64-DARWIN: #define __FLT_MAX_EXP__ 128 -// AARCH64-DARWIN: #define __FLT_MAX__ 3.40282347e+38F -// AARCH64-DARWIN: #define __FLT_MIN_10_EXP__ (-37) -// AARCH64-DARWIN: #define __FLT_MIN_EXP__ (-125) -// AARCH64-DARWIN: #define __FLT_MIN__ 1.17549435e-38F -// AARCH64-DARWIN: #define __FLT_RADIX__ 2 -// AARCH64-DARWIN: #define __INT16_C_SUFFIX__ -// AARCH64-DARWIN: #define __INT16_FMTd__ "hd" -// AARCH64-DARWIN: #define __INT16_FMTi__ "hi" -// AARCH64-DARWIN: #define __INT16_MAX__ 32767 -// AARCH64-DARWIN: #define __INT16_TYPE__ short -// AARCH64-DARWIN: #define __INT32_C_SUFFIX__ -// AARCH64-DARWIN: #define __INT32_FMTd__ "d" -// AARCH64-DARWIN: #define __INT32_FMTi__ "i" -// AARCH64-DARWIN: #define __INT32_MAX__ 2147483647 -// AARCH64-DARWIN: #define __INT32_TYPE__ int -// AARCH64-DARWIN: #define __INT64_C_SUFFIX__ LL -// AARCH64-DARWIN: #define __INT64_FMTd__ "lld" -// AARCH64-DARWIN: #define __INT64_FMTi__ "lli" -// AARCH64-DARWIN: #define __INT64_MAX__ 9223372036854775807LL -// AARCH64-DARWIN: #define __INT64_TYPE__ long long int -// AARCH64-DARWIN: #define __INT8_C_SUFFIX__ -// AARCH64-DARWIN: #define __INT8_FMTd__ "hhd" -// AARCH64-DARWIN: #define __INT8_FMTi__ "hhi" -// AARCH64-DARWIN: #define __INT8_MAX__ 127 -// AARCH64-DARWIN: #define __INT8_TYPE__ signed char -// AARCH64-DARWIN: #define __INTMAX_C_SUFFIX__ L -// AARCH64-DARWIN: #define __INTMAX_FMTd__ "ld" -// AARCH64-DARWIN: #define __INTMAX_FMTi__ "li" -// AARCH64-DARWIN: #define __INTMAX_MAX__ 9223372036854775807L -// AARCH64-DARWIN: #define __INTMAX_TYPE__ long int -// AARCH64-DARWIN: #define __INTMAX_WIDTH__ 64 -// AARCH64-DARWIN: #define __INTPTR_FMTd__ "ld" -// AARCH64-DARWIN: #define __INTPTR_FMTi__ "li" -// AARCH64-DARWIN: #define __INTPTR_MAX__ 9223372036854775807L -// AARCH64-DARWIN: #define __INTPTR_TYPE__ long int -// AARCH64-DARWIN: #define __INTPTR_WIDTH__ 64 -// AARCH64-DARWIN: #define __INT_FAST16_FMTd__ "hd" -// AARCH64-DARWIN: #define __INT_FAST16_FMTi__ "hi" -// AARCH64-DARWIN: #define __INT_FAST16_MAX__ 32767 -// AARCH64-DARWIN: #define __INT_FAST16_TYPE__ short -// AARCH64-DARWIN: #define __INT_FAST32_FMTd__ "d" -// AARCH64-DARWIN: #define __INT_FAST32_FMTi__ "i" -// AARCH64-DARWIN: #define __INT_FAST32_MAX__ 2147483647 -// AARCH64-DARWIN: #define __INT_FAST32_TYPE__ int -// AARCH64-DARWIN: #define __INT_FAST64_FMTd__ "lld" -// AARCH64-DARWIN: #define __INT_FAST64_FMTi__ "lli" -// AARCH64-DARWIN: #define __INT_FAST64_MAX__ 9223372036854775807LL -// AARCH64-DARWIN: #define __INT_FAST64_TYPE__ long long int -// 
AARCH64-DARWIN: #define __INT_FAST8_FMTd__ "hhd" -// AARCH64-DARWIN: #define __INT_FAST8_FMTi__ "hhi" -// AARCH64-DARWIN: #define __INT_FAST8_MAX__ 127 -// AARCH64-DARWIN: #define __INT_FAST8_TYPE__ signed char -// AARCH64-DARWIN: #define __INT_LEAST16_FMTd__ "hd" -// AARCH64-DARWIN: #define __INT_LEAST16_FMTi__ "hi" -// AARCH64-DARWIN: #define __INT_LEAST16_MAX__ 32767 -// AARCH64-DARWIN: #define __INT_LEAST16_TYPE__ short -// AARCH64-DARWIN: #define __INT_LEAST32_FMTd__ "d" -// AARCH64-DARWIN: #define __INT_LEAST32_FMTi__ "i" -// AARCH64-DARWIN: #define __INT_LEAST32_MAX__ 2147483647 -// AARCH64-DARWIN: #define __INT_LEAST32_TYPE__ int -// AARCH64-DARWIN: #define __INT_LEAST64_FMTd__ "lld" -// AARCH64-DARWIN: #define __INT_LEAST64_FMTi__ "lli" -// AARCH64-DARWIN: #define __INT_LEAST64_MAX__ 9223372036854775807LL -// AARCH64-DARWIN: #define __INT_LEAST64_TYPE__ long long int -// AARCH64-DARWIN: #define __INT_LEAST8_FMTd__ "hhd" -// AARCH64-DARWIN: #define __INT_LEAST8_FMTi__ "hhi" -// AARCH64-DARWIN: #define __INT_LEAST8_MAX__ 127 -// AARCH64-DARWIN: #define __INT_LEAST8_TYPE__ signed char -// AARCH64-DARWIN: #define __INT_MAX__ 2147483647 -// AARCH64-DARWIN: #define __LDBL_DENORM_MIN__ 4.9406564584124654e-324L -// AARCH64-DARWIN: #define __LDBL_DIG__ 15 -// AARCH64-DARWIN: #define __LDBL_EPSILON__ 2.2204460492503131e-16L -// AARCH64-DARWIN: #define __LDBL_HAS_DENORM__ 1 -// AARCH64-DARWIN: #define __LDBL_HAS_INFINITY__ 1 -// AARCH64-DARWIN: #define __LDBL_HAS_QUIET_NAN__ 1 -// AARCH64-DARWIN: #define __LDBL_MANT_DIG__ 53 -// AARCH64-DARWIN: #define __LDBL_MAX_10_EXP__ 308 -// AARCH64-DARWIN: #define __LDBL_MAX_EXP__ 1024 -// AARCH64-DARWIN: #define __LDBL_MAX__ 1.7976931348623157e+308L -// AARCH64-DARWIN: #define __LDBL_MIN_10_EXP__ (-307) -// AARCH64-DARWIN: #define __LDBL_MIN_EXP__ (-1021) -// AARCH64-DARWIN: #define __LDBL_MIN__ 2.2250738585072014e-308L -// AARCH64-DARWIN: #define __LONG_LONG_MAX__ 9223372036854775807LL -// AARCH64-DARWIN: #define __LONG_MAX__ 9223372036854775807L -// AARCH64-DARWIN: #define __LP64__ 1 -// AARCH64-DARWIN: #define __POINTER_WIDTH__ 64 -// AARCH64-DARWIN: #define __PTRDIFF_TYPE__ long int -// AARCH64-DARWIN: #define __PTRDIFF_WIDTH__ 64 -// AARCH64-DARWIN: #define __SCHAR_MAX__ 127 -// AARCH64-DARWIN: #define __SHRT_MAX__ 32767 -// AARCH64-DARWIN: #define __SIG_ATOMIC_MAX__ 2147483647 -// AARCH64-DARWIN: #define __SIG_ATOMIC_WIDTH__ 32 -// AARCH64-DARWIN: #define __SIZEOF_DOUBLE__ 8 -// AARCH64-DARWIN: #define __SIZEOF_FLOAT__ 4 -// AARCH64-DARWIN: #define __SIZEOF_INT128__ 16 -// AARCH64-DARWIN: #define __SIZEOF_INT__ 4 -// AARCH64-DARWIN: #define __SIZEOF_LONG_DOUBLE__ 8 -// AARCH64-DARWIN: #define __SIZEOF_LONG_LONG__ 8 -// AARCH64-DARWIN: #define __SIZEOF_LONG__ 8 -// AARCH64-DARWIN: #define __SIZEOF_POINTER__ 8 -// AARCH64-DARWIN: #define __SIZEOF_PTRDIFF_T__ 8 -// AARCH64-DARWIN: #define __SIZEOF_SHORT__ 2 -// AARCH64-DARWIN: #define __SIZEOF_SIZE_T__ 8 -// AARCH64-DARWIN: #define __SIZEOF_WCHAR_T__ 4 -// AARCH64-DARWIN: #define __SIZEOF_WINT_T__ 4 -// AARCH64-DARWIN: #define __SIZE_MAX__ 18446744073709551615UL -// AARCH64-DARWIN: #define __SIZE_TYPE__ long unsigned int -// AARCH64-DARWIN: #define __SIZE_WIDTH__ 64 -// AARCH64-DARWIN: #define __UINT16_C_SUFFIX__ -// AARCH64-DARWIN: #define __UINT16_MAX__ 65535 -// AARCH64-DARWIN: #define __UINT16_TYPE__ unsigned short -// AARCH64-DARWIN: #define __UINT32_C_SUFFIX__ U -// AARCH64-DARWIN: #define __UINT32_MAX__ 4294967295U -// AARCH64-DARWIN: #define __UINT32_TYPE__ unsigned int -// AARCH64-DARWIN: 
#define __UINT64_C_SUFFIX__ ULL -// AARCH64-DARWIN: #define __UINT64_MAX__ 18446744073709551615ULL -// AARCH64-DARWIN: #define __UINT64_TYPE__ long long unsigned int -// AARCH64-DARWIN: #define __UINT8_C_SUFFIX__ -// AARCH64-DARWIN: #define __UINT8_MAX__ 255 -// AARCH64-DARWIN: #define __UINT8_TYPE__ unsigned char -// AARCH64-DARWIN: #define __UINTMAX_C_SUFFIX__ UL -// AARCH64-DARWIN: #define __UINTMAX_MAX__ 18446744073709551615UL -// AARCH64-DARWIN: #define __UINTMAX_TYPE__ long unsigned int -// AARCH64-DARWIN: #define __UINTMAX_WIDTH__ 64 -// AARCH64-DARWIN: #define __UINTPTR_MAX__ 18446744073709551615UL -// AARCH64-DARWIN: #define __UINTPTR_TYPE__ long unsigned int -// AARCH64-DARWIN: #define __UINTPTR_WIDTH__ 64 -// AARCH64-DARWIN: #define __UINT_FAST16_MAX__ 65535 -// AARCH64-DARWIN: #define __UINT_FAST16_TYPE__ unsigned short -// AARCH64-DARWIN: #define __UINT_FAST32_MAX__ 4294967295U -// AARCH64-DARWIN: #define __UINT_FAST32_TYPE__ unsigned int -// AARCH64-DARWIN: #define __UINT_FAST64_MAX__ 18446744073709551615ULL -// AARCH64-DARWIN: #define __UINT_FAST64_TYPE__ long long unsigned int -// AARCH64-DARWIN: #define __UINT_FAST8_MAX__ 255 -// AARCH64-DARWIN: #define __UINT_FAST8_TYPE__ unsigned char -// AARCH64-DARWIN: #define __UINT_LEAST16_MAX__ 65535 -// AARCH64-DARWIN: #define __UINT_LEAST16_TYPE__ unsigned short -// AARCH64-DARWIN: #define __UINT_LEAST32_MAX__ 4294967295U -// AARCH64-DARWIN: #define __UINT_LEAST32_TYPE__ unsigned int -// AARCH64-DARWIN: #define __UINT_LEAST64_MAX__ 18446744073709551615ULL -// AARCH64-DARWIN: #define __UINT_LEAST64_TYPE__ long long unsigned int -// AARCH64-DARWIN: #define __UINT_LEAST8_MAX__ 255 -// AARCH64-DARWIN: #define __UINT_LEAST8_TYPE__ unsigned char -// AARCH64-DARWIN: #define __USER_LABEL_PREFIX__ _ -// AARCH64-DARWIN: #define __WCHAR_MAX__ 2147483647 -// AARCH64-DARWIN: #define __WCHAR_TYPE__ int -// AARCH64-DARWIN-NOT: #define __WCHAR_UNSIGNED__ -// AARCH64-DARWIN: #define __WCHAR_WIDTH__ 32 -// AARCH64-DARWIN: #define __WINT_TYPE__ int -// AARCH64-DARWIN: #define __WINT_WIDTH__ 32 -// AARCH64-DARWIN: #define __aarch64__ 1 - // RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv7-windows-msvc < /dev/null | FileCheck -match-full-lines -check-prefix ARM-MSVC %s // // ARM-MSVC: #define _M_ARM_NT 1 // ARM-MSVC: #define _WIN32 1 // ARM-MSVC-NOT:#define __ARM_DWARF_EH__ 1 -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-windows-msvc < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-MSVC %s -// -// AARCH64-MSVC: #define _INTEGRAL_MAX_BITS 64 -// AARCH64-MSVC-NOT: #define _LP64 1 -// AARCH64-MSVC: #define _M_ARM64 1 -// AARCH64-MSVC: #define _WIN32 1 -// AARCH64-MSVC: #define _WIN64 1 -// AARCH64-MSVC: #define __AARCH64EL__ 1 -// AARCH64-MSVC: #define __ARM_64BIT_STATE 1 -// AARCH64-MSVC: #define __ARM_ACLE 200 -// AARCH64-MSVC: #define __ARM_ALIGN_MAX_STACK_PWR 4 -// AARCH64-MSVC: #define __ARM_ARCH 8 -// AARCH64-MSVC: #define __ARM_ARCH_ISA_A64 1 -// AARCH64-MSVC: #define __ARM_ARCH_PROFILE 'A' -// AARCH64-MSVC: #define __ARM_FEATURE_CLZ 1 -// AARCH64-MSVC: #define __ARM_FEATURE_DIRECTED_ROUNDING 1 -// AARCH64-MSVC: #define __ARM_FEATURE_DIV 1 -// AARCH64-MSVC: #define __ARM_FEATURE_FMA 1 -// AARCH64-MSVC: #define __ARM_FEATURE_IDIV 1 -// AARCH64-MSVC: #define __ARM_FEATURE_LDREX 0xF -// AARCH64-MSVC: #define __ARM_FEATURE_NUMERIC_MAXMIN 1 -// AARCH64-MSVC: #define __ARM_FEATURE_UNALIGNED 1 -// AARCH64-MSVC: #define __ARM_FP 0xE -// AARCH64-MSVC: #define __ARM_FP16_ARGS 1 -// AARCH64-MSVC: #define __ARM_FP16_FORMAT_IEEE 
1 -// AARCH64-MSVC: #define __ARM_PCS_AAPCS64 1 -// AARCH64-MSVC: #define __ARM_SIZEOF_MINIMAL_ENUM 4 -// AARCH64-MSVC: #define __ARM_SIZEOF_WCHAR_T 4 -// AARCH64-MSVC: #define __BIGGEST_ALIGNMENT__ 16 -// AARCH64-MSVC: #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ -// AARCH64-MSVC: #define __CHAR16_TYPE__ unsigned short -// AARCH64-MSVC: #define __CHAR32_TYPE__ unsigned int -// AARCH64-MSVC: #define __CHAR_BIT__ 8 -// AARCH64-MSVC: #define __CONSTANT_CFSTRINGS__ 1 -// AARCH64-MSVC: #define __DBL_DECIMAL_DIG__ 17 -// AARCH64-MSVC: #define __DBL_DENORM_MIN__ 4.9406564584124654e-324 -// AARCH64-MSVC: #define __DBL_DIG__ 15 -// AARCH64-MSVC: #define __DBL_EPSILON__ 2.2204460492503131e-16 -// AARCH64-MSVC: #define __DBL_HAS_DENORM__ 1 -// AARCH64-MSVC: #define __DBL_HAS_INFINITY__ 1 -// AARCH64-MSVC: #define __DBL_HAS_QUIET_NAN__ 1 -// AARCH64-MSVC: #define __DBL_MANT_DIG__ 53 -// AARCH64-MSVC: #define __DBL_MAX_10_EXP__ 308 -// AARCH64-MSVC: #define __DBL_MAX_EXP__ 1024 -// AARCH64-MSVC: #define __DBL_MAX__ 1.7976931348623157e+308 -// AARCH64-MSVC: #define __DBL_MIN_10_EXP__ (-307) -// AARCH64-MSVC: #define __DBL_MIN_EXP__ (-1021) -// AARCH64-MSVC: #define __DBL_MIN__ 2.2250738585072014e-308 -// AARCH64-MSVC: #define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__ -// AARCH64-MSVC: #define __FINITE_MATH_ONLY__ 0 -// AARCH64-MSVC: #define __FLT_DECIMAL_DIG__ 9 -// AARCH64-MSVC: #define __FLT_DENORM_MIN__ 1.40129846e-45F -// AARCH64-MSVC: #define __FLT_DIG__ 6 -// AARCH64-MSVC: #define __FLT_EPSILON__ 1.19209290e-7F -// AARCH64-MSVC: #define __FLT_EVAL_METHOD__ 0 -// AARCH64-MSVC: #define __FLT_HAS_DENORM__ 1 -// AARCH64-MSVC: #define __FLT_HAS_INFINITY__ 1 -// AARCH64-MSVC: #define __FLT_HAS_QUIET_NAN__ 1 -// AARCH64-MSVC: #define __FLT_MANT_DIG__ 24 -// AARCH64-MSVC: #define __FLT_MAX_10_EXP__ 38 -// AARCH64-MSVC: #define __FLT_MAX_EXP__ 128 -// AARCH64-MSVC: #define __FLT_MAX__ 3.40282347e+38F -// AARCH64-MSVC: #define __FLT_MIN_10_EXP__ (-37) -// AARCH64-MSVC: #define __FLT_MIN_EXP__ (-125) -// AARCH64-MSVC: #define __FLT_MIN__ 1.17549435e-38F -// AARCH64-MSVC: #define __FLT_RADIX__ 2 -// AARCH64-MSVC: #define __INT_MAX__ 2147483647 -// AARCH64-MSVC: #define __LDBL_DECIMAL_DIG__ 17 -// AARCH64-MSVC: #define __LDBL_DENORM_MIN__ 4.9406564584124654e-324L -// AARCH64-MSVC: #define __LDBL_DIG__ 15 -// AARCH64-MSVC: #define __LDBL_EPSILON__ 2.2204460492503131e-16L -// AARCH64-MSVC: #define __LDBL_HAS_DENORM__ 1 -// AARCH64-MSVC: #define __LDBL_HAS_INFINITY__ 1 -// AARCH64-MSVC: #define __LDBL_HAS_QUIET_NAN__ 1 -// AARCH64-MSVC: #define __LDBL_MANT_DIG__ 53 -// AARCH64-MSVC: #define __LDBL_MAX_10_EXP__ 308 -// AARCH64-MSVC: #define __LDBL_MAX_EXP__ 1024 -// AARCH64-MSVC: #define __LDBL_MAX__ 1.7976931348623157e+308L -// AARCH64-MSVC: #define __LDBL_MIN_10_EXP__ (-307) -// AARCH64-MSVC: #define __LDBL_MIN_EXP__ (-1021) -// AARCH64-MSVC: #define __LDBL_MIN__ 2.2250738585072014e-308L -// AARCH64-MSVC: #define __LITTLE_ENDIAN__ 1 -// AARCH64-MSVC: #define __LONG_LONG_MAX__ 9223372036854775807LL -// AARCH64-MSVC: #define __LONG_MAX__ 2147483647L -// AARCH64-MSVC-NOT: #define __LP64__ 1 -// AARCH64-MSVC: #define __NO_INLINE__ 1 -// AARCH64-MSVC: #define __OBJC_BOOL_IS_BOOL 0 -// AARCH64-MSVC: #define __ORDER_BIG_ENDIAN__ 4321 -// AARCH64-MSVC: #define __ORDER_LITTLE_ENDIAN__ 1234 -// AARCH64-MSVC: #define __ORDER_PDP_ENDIAN__ 3412 -// AARCH64-MSVC: #define __POINTER_WIDTH__ 64 -// AARCH64-MSVC: #define __PRAGMA_REDEFINE_EXTNAME 1 -// AARCH64-MSVC: #define __SCHAR_MAX__ 127 -// AARCH64-MSVC: #define __SHRT_MAX__ 
32767 -// AARCH64-MSVC: #define __SIG_ATOMIC_MAX__ 2147483647 -// AARCH64-MSVC: #define __SIG_ATOMIC_WIDTH__ 32 -// AARCH64-MSVC: #define __SIZEOF_DOUBLE__ 8 -// AARCH64-MSVC: #define __SIZEOF_FLOAT__ 4 -// AARCH64-MSVC: #define __SIZEOF_INT128__ 16 -// AARCH64-MSVC: #define __SIZEOF_INT__ 4 -// AARCH64-MSVC: #define __SIZEOF_LONG_DOUBLE__ 8 -// AARCH64-MSVC: #define __SIZEOF_LONG_LONG__ 8 -// AARCH64-MSVC: #define __SIZEOF_LONG__ 4 -// AARCH64-MSVC: #define __SIZEOF_POINTER__ 8 -// AARCH64-MSVC: #define __SIZEOF_PTRDIFF_T__ 8 -// AARCH64-MSVC: #define __SIZEOF_SHORT__ 2 -// AARCH64-MSVC: #define __SIZEOF_SIZE_T__ 8 -// AARCH64-MSVC: #define __SIZEOF_WCHAR_T__ 2 -// AARCH64-MSVC: #define __SIZEOF_WINT_T__ 2 -// AARCH64-MSVC: #define __SIZE_MAX__ 18446744073709551615ULL -// AARCH64-MSVC: #define __SIZE_TYPE__ long long unsigned int -// AARCH64-MSVC: #define __SIZE_WIDTH__ 64 -// AARCH64-MSVC: #define __STDC_HOSTED__ 0 -// AARCH64-MSVC: #define __STDC_UTF_16__ 1 -// AARCH64-MSVC: #define __STDC_UTF_32__ 1 -// AARCH64-MSVC: #define __STDC_VERSION__ 201112L -// AARCH64-MSVC: #define __STDC__ 1 -// AARCH64-MSVC: #define __UINT16_C_SUFFIX__ -// AARCH64-MSVC: #define __UINT16_MAX__ 65535 -// AARCH64-MSVC: #define __UINT16_TYPE__ unsigned short -// AARCH64-MSVC: #define __UINT32_C_SUFFIX__ U -// AARCH64-MSVC: #define __UINT32_MAX__ 4294967295U -// AARCH64-MSVC: #define __UINT32_TYPE__ unsigned int -// AARCH64-MSVC: #define __UINT64_C_SUFFIX__ ULL -// AARCH64-MSVC: #define __UINT64_MAX__ 18446744073709551615ULL -// AARCH64-MSVC: #define __UINT64_TYPE__ long long unsigned int -// AARCH64-MSVC: #define __UINT8_C_SUFFIX__ -// AARCH64-MSVC: #define __UINT8_MAX__ 255 -// AARCH64-MSVC: #define __UINT8_TYPE__ unsigned char -// AARCH64-MSVC: #define __UINTMAX_C_SUFFIX__ ULL -// AARCH64-MSVC: #define __UINTMAX_MAX__ 18446744073709551615ULL -// AARCH64-MSVC: #define __UINTMAX_TYPE__ long long unsigned int -// AARCH64-MSVC: #define __UINTMAX_WIDTH__ 64 -// AARCH64-MSVC: #define __UINTPTR_MAX__ 18446744073709551615ULL -// AARCH64-MSVC: #define __UINTPTR_TYPE__ long long unsigned int -// AARCH64-MSVC: #define __UINTPTR_WIDTH__ 64 -// AARCH64-MSVC: #define __UINT_FAST16_MAX__ 65535 -// AARCH64-MSVC: #define __UINT_FAST16_TYPE__ unsigned short -// AARCH64-MSVC: #define __UINT_FAST32_MAX__ 4294967295U -// AARCH64-MSVC: #define __UINT_FAST32_TYPE__ unsigned int -// AARCH64-MSVC: #define __UINT_FAST64_MAX__ 18446744073709551615ULL -// AARCH64-MSVC: #define __UINT_FAST64_TYPE__ long long unsigned int -// AARCH64-MSVC: #define __UINT_FAST8_MAX__ 255 -// AARCH64-MSVC: #define __UINT_FAST8_TYPE__ unsigned char -// AARCH64-MSVC: #define __UINT_LEAST16_MAX__ 65535 -// AARCH64-MSVC: #define __UINT_LEAST16_TYPE__ unsigned short -// AARCH64-MSVC: #define __UINT_LEAST32_MAX__ 4294967295U -// AARCH64-MSVC: #define __UINT_LEAST32_TYPE__ unsigned int -// AARCH64-MSVC: #define __UINT_LEAST64_MAX__ 18446744073709551615ULL -// AARCH64-MSVC: #define __UINT_LEAST64_TYPE__ long long unsigned int -// AARCH64-MSVC: #define __UINT_LEAST8_MAX__ 255 -// AARCH64-MSVC: #define __UINT_LEAST8_TYPE__ unsigned char -// AARCH64-MSVC: #define __USER_LABEL_PREFIX__ -// AARCH64-MSVC: #define __WCHAR_MAX__ 65535 -// AARCH64-MSVC: #define __WCHAR_TYPE__ unsigned short -// AARCH64-MSVC: #define __WCHAR_UNSIGNED__ 1 -// AARCH64-MSVC: #define __WCHAR_WIDTH__ 16 -// AARCH64-MSVC: #define __WINT_TYPE__ unsigned short -// AARCH64-MSVC: #define __WINT_WIDTH__ 16 -// AARCH64-MSVC: #define __aarch64__ 1 - // RUN: %clang_cc1 -E -dM -ffreestanding 
-triple=arm-none-none < /dev/null | FileCheck -match-full-lines -check-prefix ARM %s // RUN: %clang_cc1 -x c++ -E -dM -ffreestanding -triple=arm-none-none < /dev/null | FileCheck -match-full-lines -check-prefix ARM -check-prefix ARM-CXX %s // @@ -7528,7 +6198,7 @@ // X86_64:#define __WINT_WIDTH__ 32 // X86_64:#define __amd64 1 // X86_64:#define __amd64__ 1 -// X86_64:#define __code_model_small_ 1 +// X86_64:#define __code_model_small__ 1 // X86_64:#define __x86_64 1 // X86_64:#define __x86_64__ 1 // @@ -7540,7 +6210,7 @@ // X86_64H:#define __x86_64h__ 1 // // RUN: %clang -xc - -E -dM -mcmodel=medium --target=i386-unknown-linux < /dev/null | FileCheck -match-full-lines -check-prefix X86_MEDIUM %s -// X86_MEDIUM:#define __code_model_medium_ 1 +// X86_MEDIUM:#define __code_model_medium__ 1 // // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=x86_64-none-none-gnux32 < /dev/null | FileCheck -match-full-lines -check-prefix X32 %s // RUN: %clang_cc1 -x c++ -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=x86_64-none-none-gnux32 < /dev/null | FileCheck -match-full-lines -check-prefix X32 -check-prefix X32-CXX %s diff --git a/clang/test/Sema/MicrosoftExtensions.c b/clang/test/Sema/MicrosoftExtensions.c --- a/clang/test/Sema/MicrosoftExtensions.c +++ b/clang/test/Sema/MicrosoftExtensions.c @@ -99,7 +99,7 @@ sh = (short)ptr; // expected-warning{{cast to smaller integer type 'short' from 'char *' is a Microsoft extension}} // This is valid ISO C. - _Bool b = (_Bool)ptr; // expected-warning{{cast to smaller integer type '_Bool' from 'char *' is a Microsoft extension}} + _Bool b = (_Bool)ptr; } typedef struct { diff --git a/clang/test/Sema/cast.c b/clang/test/Sema/cast.c --- a/clang/test/Sema/cast.c +++ b/clang/test/Sema/cast.c @@ -151,7 +151,7 @@ } void testVoidPtr(VoidPtr v) { - (void) (Bool) v; // expected-warning{{cast to smaller integer type 'Bool' (aka '_Bool') from 'VoidPtr' (aka 'void *')}} + (void)(Bool) v; (void) (Int) v; // expected-warning{{cast to smaller integer type 'Int' (aka 'int') from 'VoidPtr' (aka 'void *')}} (void) (Long) v; (void) (VoidPtr) v; @@ -160,12 +160,12 @@ // from other -Wpointer-to-int-cast warnings. #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wvoid-pointer-to-int-cast" - (void) (Bool) v; // no-warning + (void)(Int) v; // no-warning #pragma clang diagnostic pop } void testCharPtr(CharPtr v) { - (void) (Bool) v; // expected-warning{{cast to smaller integer type 'Bool' (aka '_Bool') from 'CharPtr' (aka 'char *')}} + (void)(Bool) v; (void) (Int) v; // expected-warning{{cast to smaller integer type 'Int' (aka 'int') from 'CharPtr' (aka 'char *')}} (void) (Long) v; (void) (VoidPtr) v; @@ -174,7 +174,7 @@ // from other -Wpointer-to-int-cast warnings. 
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wvoid-pointer-to-int-cast"
-  (void) (Bool) v; // expected-warning{{cast to smaller integer type 'Bool' (aka '_Bool') from 'CharPtr' (aka 'char *')}}
+  (void)(Int) v; // expected-warning{{cast to smaller integer type 'Int' (aka 'int') from 'CharPtr' (aka 'char *')}}
 #pragma clang diagnostic pop
 }
diff --git a/clang/test/SemaCXX/cstyle-cast.cpp b/clang/test/SemaCXX/cstyle-cast.cpp
--- a/clang/test/SemaCXX/cstyle-cast.cpp
+++ b/clang/test/SemaCXX/cstyle-cast.cpp
@@ -178,6 +178,11 @@
   fnptr fnp = (fnptr)(l);
   (void)(char)(fnp); // expected-error {{cast from pointer to smaller type 'char' loses information}}
   (void)(long)(fnp);
+
+  (void)(bool)((void*)0);
+  (void)(bool)((int*)0);
+  (void)(char)((void*)0); // expected-error {{cast from pointer to smaller type 'char' loses information}}
+  (void)(char)((int*)0); // expected-error {{cast from pointer to smaller type 'char' loses information}}
 }
 
 void pointer_conversion()
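The new cstyle-cast.cpp cases above capture the rationale for exempting `bool` from the pointer-to-smaller-type diagnostic: converting a pointer to `bool` is a boolean conversion that only asks "is it null?", so no address bits are lost. A minimal standalone illustration (hypothetical example, not part of the patch):

```cpp
#include <cassert>

int main() {
  int x = 0;
  int *p = &x;      // non-null
  int *q = nullptr; // null

  // Pointer-to-bool is well defined: the result records only nullness,
  // so nothing is truncated and no "loses information" diagnostic is
  // warranted.
  assert((bool)p == true);
  assert((bool)q == false);

  // Pointer-to-char, by contrast, tries to squeeze a full address into
  // 8 bits and genuinely loses information; this is what the
  // expected-error lines in the test above are checking.
  // char c = (char)p; // error: cast from pointer to smaller type

  return 0;
}
```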
diff --git a/compiler-rt/lib/profile/GCDAProfiling.c b/compiler-rt/lib/profile/GCDAProfiling.c
--- a/compiler-rt/lib/profile/GCDAProfiling.c
+++ b/compiler-rt/lib/profile/GCDAProfiling.c
@@ -32,8 +32,9 @@
 #include <windows.h>
 #include "WindowsMMap.h"
 #else
-#include <sys/mman.h>
 #include <sys/file.h>
+#include <sys/mman.h>
+#include <unistd.h>
 #endif
 
 #if defined(__FreeBSD__) && defined(__i386__)
@@ -62,27 +63,20 @@
 #include "InstrProfiling.h"
 #include "InstrProfilingUtil.h"
 
+/* #define DEBUG_GCDAPROFILING */
+
 #ifndef _WIN32
 #include <pthread.h>
-static pthread_mutex_t gcov_flush_mutex = PTHREAD_MUTEX_INITIALIZER;
-static __inline void gcov_flush_lock() {
-  pthread_mutex_lock(&gcov_flush_mutex);
-}
-static __inline void gcov_flush_unlock() {
-  pthread_mutex_unlock(&gcov_flush_mutex);
-}
+pthread_mutex_t gcov_mutex = PTHREAD_MUTEX_INITIALIZER;
+static __inline void gcov_lock() { pthread_mutex_lock(&gcov_mutex); }
+static __inline void gcov_unlock() { pthread_mutex_unlock(&gcov_mutex); }
 #else
 #include <windows.h>
-static SRWLOCK gcov_flush_mutex = SRWLOCK_INIT;
-static __inline void gcov_flush_lock() {
-  AcquireSRWLockExclusive(&gcov_flush_mutex);
-}
-static __inline void gcov_flush_unlock() {
-  ReleaseSRWLockExclusive(&gcov_flush_mutex);
-}
+SRWLOCK gcov_mutex = SRWLOCK_INIT;
+static __inline void gcov_lock() { AcquireSRWLockExclusive(&gcov_mutex); }
+static __inline void gcov_unlock() { ReleaseSRWLockExclusive(&gcov_mutex); }
 #endif
-/* #define DEBUG_GCDAPROFILING */
 
 /*
  * --- GCOV file format I/O primitives ---
 */
@@ -138,6 +132,12 @@
  */
 struct fn_list flush_fn_list;
 
+/*
+ * A list of reset functions that our __gcov_reset() function should call,
+ * shared between all dynamic objects.
+ */
+struct fn_list reset_fn_list;
+
 static void fn_list_insert(struct fn_list* list, fn_ptr fn) {
   struct fn_node* new_node = malloc(sizeof(struct fn_node));
   new_node->fn = fn;
@@ -638,8 +638,25 @@
   fn_list_insert(&flush_fn_list, fn);
 }
 
+COMPILER_RT_VISIBILITY
+void llvm_register_reset_function(fn_ptr fn) {
+  fn_list_insert(&reset_fn_list, fn);
+}
+
+COMPILER_RT_VISIBILITY
+void llvm_reset_counters(void) {
+  struct fn_node *curr = reset_fn_list.head;
+
+  while (curr) {
+    if (curr->id == CURRENT_ID) {
+      curr->fn();
+    }
+    curr = curr->next;
+  }
+}
+
 void __gcov_flush() {
-  gcov_flush_lock();
+  gcov_lock();
 
   struct fn_node* curr = flush_fn_list.head;
@@ -648,30 +665,69 @@
     curr = curr->next;
   }
 
-  gcov_flush_unlock();
+  gcov_unlock();
 }
 
+#if !defined(_WIN32)
+pid_t __gcov_fork() {
+  pid_t parent_pid = getpid();
+  pid_t pid;
+
+  gcov_lock();
+  // Avoid concurrent modification of the lists during the fork. For example,
+  // if one thread forks while another thread is loading a CU (and therefore
+  // running global initializers), the child process could inherit a corrupted
+  // list (e.g. a bad tail) or a malloc in an inconsistent state.
+  pid = fork();
+  gcov_unlock();
+
+  if (pid == 0) {
+    pid_t child_pid = getpid();
+    if (child_pid != parent_pid) {
+      // The pid changed, so this really is a fork (a program can install its
+      // own fork function); just reset the counters for this child process.
+      // No need to lock here since we just forked and cannot have any other
+      // threads.
+      llvm_reset_counters();
+    }
+  }
+  return pid;
+}
+#endif
+
 COMPILER_RT_VISIBILITY
 void llvm_delete_flush_function_list(void) { fn_list_remove(&flush_fn_list); }
 
 COMPILER_RT_VISIBILITY
-void llvm_gcov_init(fn_ptr wfn, fn_ptr ffn) {
+void llvm_delete_reset_function_list(void) { fn_list_remove(&reset_fn_list); }
+
+COMPILER_RT_VISIBILITY
+void llvm_gcov_init(fn_ptr wfn, fn_ptr ffn, fn_ptr rfn) {
   static int atexit_ran = 0;
 
+  gcov_lock();
+
   if (wfn)
     llvm_register_writeout_function(wfn);
 
   if (ffn)
     llvm_register_flush_function(ffn);
 
+  if (rfn)
+    llvm_register_reset_function(rfn);
+
+  gcov_unlock();
+
   if (atexit_ran == 0) {
     atexit_ran = 1;
 
     /* Make sure we write out the data and delete the data structures. */
     atexit(llvm_delete_flush_function_list);
     atexit(llvm_delete_writeout_function_list);
+    atexit(llvm_delete_reset_function_list);
     atexit(llvm_writeout_files);
   }
 }
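The `__gcov_fork` hunk above is an instance of a general rule: a library with mutable global state must prevent `fork()` from happening halfway through an update, because only the forking thread survives in the child, so any half-finished mutation stays frozen there forever. A minimal sketch of the pattern (hypothetical names; this is not the compiler-rt code itself):

```cpp
#include <pthread.h>
#include <unistd.h>

// Guards whatever global, mutable state the library maintains.
static pthread_mutex_t g_state_mutex = PTHREAD_MUTEX_INITIALIZER;

pid_t fork_with_consistent_state(void) {
  pid_t parent = getpid();

  // Holding the lock across fork() guarantees the child snapshots the
  // state only between updates, never in the middle of one.
  pthread_mutex_lock(&g_state_mutex);
  pid_t pid = fork();
  pthread_mutex_unlock(&g_state_mutex); // runs in both parent and child

  // Compare pids instead of trusting fork()'s return value alone: as
  // the comment in the patch notes, a program may interpose its own
  // fork(). No locking is needed in the child; it is single-threaded.
  if (pid == 0 && getpid() != parent) {
    // Child: reset per-process state (counters, caches, ...) here.
  }
  return pid;
}
```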
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -32,3 +32,4 @@
 # of the other directories.
 add_subdirectory(lib)
 add_subdirectory(test)
+add_subdirectory(fuzzing)
diff --git a/libc/cmake/modules/LLVMLibCRules.cmake b/libc/cmake/modules/LLVMLibCRules.cmake
--- a/libc/cmake/modules/LLVMLibCRules.cmake
+++ b/libc/cmake/modules/LLVMLibCRules.cmake
@@ -300,7 +300,7 @@
   if(NOT LLVM_INCLUDE_TESTS)
     return()
   endif()
-  
+
   cmake_parse_arguments(
     "LIBC_UNITTEST"
     "" # No optional arguments
@@ -375,6 +375,71 @@
   add_dependencies(check-libc ${suite_name})
 endfunction(add_libc_testsuite)
 
+# Rule to add a fuzzer test.
+# Usage
+#    add_libc_fuzzer(
+#      <target_name>
+#      SRCS <list of .cpp files>
+#      HDRS <list of .h files>
+#      DEPENDS <list of entrypoint targets>
+#    )
+function(add_libc_fuzzer target_name)
+  cmake_parse_arguments(
+    "LIBC_FUZZER"
+    "" # No optional arguments
+    "" # Single value arguments
+    "SRCS;HDRS;DEPENDS" # Multi-value arguments
+    ${ARGN}
+  )
+  if(NOT LIBC_FUZZER_SRCS)
+    message(FATAL_ERROR "'add_libc_fuzzer' target requires a SRCS list of .cpp files.")
+  endif()
+  if(NOT LIBC_FUZZER_DEPENDS)
+    message(FATAL_ERROR "'add_libc_fuzzer' target requires a DEPENDS list of 'add_entrypoint_object' targets.")
+  endif()
+
+  set(library_deps "")
+  foreach(dep IN LISTS LIBC_FUZZER_DEPENDS)
+    get_target_property(dep_type ${dep} "TARGET_TYPE")
+    if (dep_type)
+      string(COMPARE EQUAL ${dep_type} ${ENTRYPOINT_OBJ_TARGET_TYPE} dep_is_entrypoint)
+      if(dep_is_entrypoint)
+        get_target_property(obj_file ${dep} "OBJECT_FILE_RAW")
+        list(APPEND library_deps ${obj_file})
+        continue()
+      endif()
+    endif()
+    # TODO: Check if the dep is a normal CMake library target. If yes, then
+    # add it to the list of library_deps.
+  endforeach(dep)
+
+  add_executable(
+    ${target_name}
+    EXCLUDE_FROM_ALL
+    ${LIBC_FUZZER_SRCS}
+    ${LIBC_FUZZER_HDRS}
+  )
+  target_include_directories(
+    ${target_name}
+    PRIVATE
+      ${LIBC_SOURCE_DIR}
+      ${LIBC_BUILD_DIR}
+      ${LIBC_BUILD_DIR}/include
+  )
+
+  if(library_deps)
+    target_link_libraries(${target_name} PRIVATE ${library_deps})
+  endif()
+
+  set_target_properties(${target_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+  add_dependencies(
+    ${target_name}
+    ${LIBC_FUZZER_DEPENDS}
+  )
+  add_dependencies(libc-fuzzer ${target_name})
+endfunction(add_libc_fuzzer)
+
 # Rule to add header only libraries.
 # Usage
 #    add_header_library(
diff --git a/libc/docs/fuzzing.rst b/libc/docs/fuzzing.rst
new file mode 100644
--- /dev/null
+++ b/libc/docs/fuzzing.rst
@@ -0,0 +1,15 @@
+Fuzzing for LLVM-libc
+---------------------
+
+Fuzzing tests are used to ensure the quality and security of LLVM-libc
+implementations.
+
+Each fuzzing test lives under the fuzzing directory, in a subdirectory
+corresponding to the src layout.
+
+Currently we use the system libc for functions that have not yet been
+implemented; as they are implemented, the fuzzers will be switched over to
+our implementations to increase test coverage.
+
+Fuzzers will be run on `oss-fuzz <https://github.com/google/oss-fuzz>`_ and the
+check-libc target will ensure that they build correctly.
diff --git a/libc/docs/source_layout.rst b/libc/docs/source_layout.rst
--- a/libc/docs/source_layout.rst
+++ b/libc/docs/source_layout.rst
@@ -7,14 +7,13 @@
   + libc
     - cmake
     - docs
+    - fuzzing
     - include
     - lib
     - loader
     - src
     - test
-    + utils
-      - build_scripts
-      - testing
+    - utils
     - www
 
 Each of these directories is explained in detail below.
@@ -31,6 +30,13 @@
 The ``docs`` directory contains design docs and also informative documents
 like this document on source layout.
 
+The ``fuzzing`` directory
+-------------------------
+
+This directory contains fuzzing tests for the various components of llvm-libc.
+The directory structure within it mirrors the directory structure of the
+top-level ``libc`` directory itself. For more details, see :doc:`fuzzing`.
+
 The ``include`` directory
 -------------------------
@@ -62,7 +68,7 @@
 This directory contains the implementations of the llvm-libc entrypoints. It
 is further organized as follows:
 
-1. There is a toplevel CMakeLists.txt file.
+1. There is a top-level CMakeLists.txt file.
 2. For every public header file provided by llvm-libc, there exists a
    corresponding directory in the ``src`` directory. The name of the directory
    is the same as the base name of the header file. For example, the directory
@@ -79,17 +85,15 @@
 in the directory ``test/src/sys/mman/`` as implementation of ``mmap`` lives
 in ``src/sys/mman``.
 
-The ``www`` directory
----------------------
+The ``utils`` directory
+-----------------------
 
-The ``www`` directory contains the HTML content of libc.llvm.org
-
-The ``utils/build_scripts`` directory
--------------------------------------
+This directory contains utilities used by other parts of the llvm-libc system.
+See the ``README`` files in the subdirectories within it to learn about the
+various utilities.
 
-This directory contains scripts which support the build system, tooling etc.
+The ``www`` directory
+---------------------
 
-The ``utils/testing`` directory
--------------------------------
+The ``www`` directory contains the HTML content of libc.llvm.org
-
-This directory contains testing infrastructure.
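Every target built by the `add_libc_fuzzer` rule above follows the standard libFuzzer contract (the `-fsanitize=fuzzer` flag that links in the driver is added in `libc/fuzzing/CMakeLists.txt` below). As a hedged sketch of that contract — the concrete strcpy fuzzer follows in the next file — a harness generally has this shape:

```cpp
#include <stddef.h>
#include <stdint.h>

// The libFuzzer driver calls this once per generated input.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  // 1. Cheaply reject inputs that violate the preconditions of the API
  //    under test (for strcpy: a null-terminated source string).
  if (size == 0 || data[size - 1] != '\0')
    return 0; // 0 means "input consumed, keep fuzzing", not failure.

  // 2. Exercise the function under test with the input.
  // 3. Verify postconditions, trapping (e.g. via __builtin_trap()) on
  //    any violation so the driver records a crash and saves the input.
  return 0;
}
```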
diff --git a/libc/fuzzing/CMakeLists.txt b/libc/fuzzing/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/fuzzing/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer")
+add_custom_target(libc-fuzzer)
+add_dependencies(check-libc libc-fuzzer)
+
+add_subdirectory(string)
diff --git a/libc/fuzzing/string/CMakeLists.txt b/libc/fuzzing/string/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/fuzzing/string/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_libc_fuzzer(
+  strcpy_fuzz
+  SRCS
+    strcpy_fuzz.cpp
+  DEPENDS
+    strcpy
+)
diff --git a/libc/fuzzing/string/strcpy_fuzz.cpp b/libc/fuzzing/string/strcpy_fuzz.cpp
new file mode 100644
--- /dev/null
+++ b/libc/fuzzing/string/strcpy_fuzz.cpp
@@ -0,0 +1,39 @@
+//===--------------------- strcpy_fuzz.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc strcpy implementation.
+///
+//===----------------------------------------------------------------------===//
+#include "src/string/strcpy.h"
+#include <stddef.h>
+#include <stdint.h>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  // Validate input: strcpy requires a null-terminated source string.
+  if (!size) return 0;
+  if (data[size - 1] != '\0') return 0;
+  const char *src = (const char *)data;
+
+  char *dest = new char[size];
+  if (!dest) __builtin_trap();
+
+  __llvm_libc::strcpy(dest, src);
+
+  size_t i;
+  for (i = 0; src[i] != '\0'; i++) {
+    // Ensure every byte of src was copied to dest correctly.
+    if (dest[i] != src[i]) __builtin_trap();
+  }
+  // Ensure strcpy null-terminates dest.
+  if (dest[i] != src[i]) __builtin_trap();
+
+  delete[] dest;
+
+  return 0;
+}
+
diff --git a/libc/src/signal/linux/raise.cpp b/libc/src/signal/linux/raise.cpp
--- a/libc/src/signal/linux/raise.cpp
+++ b/libc/src/signal/linux/raise.cpp
@@ -15,7 +15,7 @@
 int LLVM_LIBC_ENTRYPOINT(raise)(int sig) {
   __llvm_libc::Sigset sigset;
-  int got = __llvm_libc::block_all_signals(sigset);
+  __llvm_libc::block_all_signals(sigset);
   long pid = __llvm_libc::syscall(SYS_getpid);
   long tid = __llvm_libc::syscall(SYS_gettid);
   int ret = __llvm_libc::syscall(SYS_tgkill, pid, tid, sig);
diff --git a/libc/utils/CPP/README.md b/libc/utils/CPP/README.md
--- a/libc/utils/CPP/README.md
+++ b/libc/utils/CPP/README.md
@@ -1,7 +1,9 @@
-This directory contains re-implementations of some C++ standard library as well
-as some LLVM utilities. These are to be used with internal LLVM libc code and
-tests. More utilities will be added on an as needed basis. There are certain
-rules to be followed for future changes and additions:
+This directory contains re-implementations of some C++ standard library
+utilities, as well as some LLVM utilities. These utilities are for use with
+internal LLVM libc code and tests.
+
+More utilities will be added on an as-needed basis. There are certain rules to
+be followed for future changes and additions:
 
 1. Only two kinds of headers can be included: other headers from this
    directory, and freestanding C headers.
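The include rules restated in `utils/CPP/README.md` above are easy to see in a concrete utility. The following is a hypothetical, illustrative sketch (not a file from this patch) of the kind of header that satisfies them: everything is expressible with language-level constructs, so no standard library includes are needed at all.

```cpp
// Sketch of the kind of utility that lives in utils/CPP: a minimal
// remove_reference re-implementation with zero #includes.
namespace __llvm_libc {
namespace cpp {

template <typename T> struct RemoveReference { using Type = T; };
template <typename T> struct RemoveReference<T &> { using Type = T; };
template <typename T> struct RemoveReference<T &&> { using Type = T; };

// A tiny IsSame lets the header test itself at compile time.
template <typename A, typename B> struct IsSame {
  static constexpr bool Value = false;
};
template <typename A> struct IsSame<A, A> {
  static constexpr bool Value = true;
};

} // namespace cpp
} // namespace __llvm_libc

static_assert(__llvm_libc::cpp::IsSame<
                  __llvm_libc::cpp::RemoveReference<int &>::Type, int>::Value,
              "RemoveReference should strip lvalue references");
```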
diff --git a/libc/utils/HdrGen/README.md b/libc/utils/HdrGen/README.md
new file mode 100644
--- /dev/null
+++ b/libc/utils/HdrGen/README.md
@@ -0,0 +1,5 @@
+# The LLVM libc header generation system
+
+LLVM libc uses a header generation scheme to generate public as well as
+internal header files. This directory contains the implementation of the
+header generator which drives this header generation scheme.
diff --git a/libc/utils/UnitTest/README.md b/libc/utils/UnitTest/README.md
new file mode 100644
--- /dev/null
+++ b/libc/utils/UnitTest/README.md
@@ -0,0 +1,23 @@
+# The LLVM libc unit test framework
+
+This directory contains a lightweight implementation of a
+[gtest](https://github.com/google/googletest)-like unit test framework for
+LLVM libc.
+
+## Why not gtest?
+
+While gtest is great, featureful and time tested, it uses the C and C++
+standard libraries. Hence, using it to test LLVM libc (which is itself an
+implementation of the C standard library) causes various kinds of mixup and
+conflict problems.
+
+## How is it different from gtest?
+
+LLVM libc's unit test framework is much less featureful than gtest, but what
+is available strives to be exactly like gtest.
+
+## Will it be made as featureful as gtest in the future?
+
+It is not clear if LLVM libc needs, or will ever need, every feature of
+gtest. We only intend to extend it on an _as needed_ basis, so it might
+never be as featureful as gtest.
diff --git a/libcxx/test/support/count_new.h b/libcxx/test/support/count_new.h
--- a/libcxx/test/support/count_new.h
+++ b/libcxx/test/support/count_new.h
@@ -347,10 +347,17 @@
 const bool MemCounter::disable_checking = false;
 #endif
 
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4640) // '%s' construction of local static object is not thread safe (/Zc:threadSafeInit-)
+#endif // _MSC_VER
 inline MemCounter* getGlobalMemCounter() {
   static MemCounter counter((MemCounter::MemCounterCtorArg_()));
   return &counter;
 }
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
 
 MemCounter &globalMemCounter = *getGlobalMemCounter();
diff --git a/libcxx/test/support/type_id.h b/libcxx/test/support/type_id.h
--- a/libcxx/test/support/type_id.h
+++ b/libcxx/test/support/type_id.h
@@ -49,11 +49,18 @@
 };
 
 // makeTypeID - Return the TypeID for the specified type 'T'.
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4640) // '%s' construction of local static object is not thread safe (/Zc:threadSafeInit-)
+#endif // _MSC_VER
 template <class T>
 inline TypeID const& makeTypeIDImp() {
   static const TypeID id(typeid(T).name());
   return id;
 }
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
 
 template <class T>
 struct TypeWrapper {};
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1218,7 +1218,7 @@
     prio = curPrio++;
   uint32_t seed = *config->shuffleSectionSeed;
   std::mt19937 g(seed ? seed : std::random_device()());
-  std::shuffle(priorities.begin(), priorities.end(), g);
+  llvm::shuffle(priorities.begin(), priorities.end(), g);
   int prioIndex = 0;
   for (InputSectionBase *sec : inputSections) {
     if (order.try_emplace(sec, priorities[prioIndex]).second)
diff --git a/lld/test/ELF/shuffle-sections-init-fini.s b/lld/test/ELF/shuffle-sections-init-fini.s
--- a/lld/test/ELF/shuffle-sections-init-fini.s
+++ b/lld/test/ELF/shuffle-sections-init-fini.s
@@ -21,12 +21,12 @@
 # CHECK:      Hex dump of section '.init_array'
 # CHECK-NEXT: 0x{{[0-9a-f]+}} ff
 # ORDERED-SAME:  000102 03040506 0708090a 0b
-# SHUFFLED-NOT:  000102 03040506 0708090a 0b
+# SHUFFLED-SAME: 04000b 06010a08 09070203 05
 
 # CHECK:      Hex dump of section '.fini_array'
 # CHECK-NEXT: 0x{{[0-9a-f]+}} ff
-# ORDERED-SAME:  000102 03040506 0708090a 0b
-# SHUFFLED-NOT:  000102 03040506 0708090a 0b
+# ORDERED-SAME:  000102 03040506 0708090a 0b
+# SHUFFLED-SAME: 090401 070b0003 080a0605 02
 
 ## With a SECTIONS command, SHT_INIT_ARRAY priorities are ignored.
 ## All .init_array* are shuffled together.
@@ -40,11 +40,8 @@
 
 # CHECK2: Hex dump of section '.init_array'
 # ORDERED2-NEXT:  0x{{[0-9a-f]+}} 00010203 04050607 08090a0b ff
-# SHUFFLED2-NOT:  0x{{[0-9a-f]+}} 00010203 04050607 08090a0b ff
+# SHUFFLED2-NEXT: 0x{{[0-9a-f]+}} 04000b06 010a0809 07ff0203 05
 
-## std::shuffle have different implementations.
-## When the number of input sections are large, it is almost guaranteed
-## to have an unordered result with --shuffle-sections=.
 .irp i,0,1,2,3,4,5,6,7,8,9,10,11
   .section .init,"ax",@progbits,unique,\i
   .byte \i
diff --git a/lld/test/ELF/shuffle-sections.s b/lld/test/ELF/shuffle-sections.s
--- a/lld/test/ELF/shuffle-sections.s
+++ b/lld/test/ELF/shuffle-sections.s
@@ -6,6 +6,12 @@
 # CHECK: Hex dump of section '.text':
 # CHECK-NEXT: 01020304
 
+## --shuffle-sections= shuffles input sections.
+# RUN: ld.lld --shuffle-sections=1 %t.o -o %t1.out
+# RUN: llvm-readelf -x .text %t1.out | FileCheck %s --check-prefix=SHUFFLE1
+# SHUFFLE1: Hex dump of section '.text':
+# SHUFFLE1-NEXT: 0204cccc 0103
+
 ## Test that --shuffle-sections= can be used with --symbol-ordering-file
 # RUN: echo "foo" > %t_order.txt
 # RUN: echo "_start " >> %t_order.txt
@@ -13,12 +19,12 @@
 # RUN: ld.lld --symbol-ordering-file %t_order.txt --shuffle-sections=2 %t.o -o %t2.out
 # RUN: llvm-readelf -x .text %t2.out | FileCheck %s --check-prefix=SHUFFLE2
 # SHUFFLE2: Hex dump of section '.text':
-# SHUFFLE2-NEXT: 02cccccc 01{{....}}
+# SHUFFLE2-NEXT: 02cccccc 010304
 
 # RUN: ld.lld --symbol-ordering-file %t_order.txt --shuffle-sections=3 %t.o -o %t3.out
 # RUN: llvm-readelf -x .text %t3.out | FileCheck %s --check-prefix=SHUFFLE3
 # SHUFFLE3: Hex dump of section '.text':
-# SHUFFLE3-NEXT: 02cccccc 01{{....}}
+# SHUFFLE3-NEXT: 02cccccc 010403
 
 ## .text has an alignment of 4.
 .global _start
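The switch from `std::shuffle` to `llvm::shuffle` above is what makes the fixed SHUFFLE1/SHUFFLE2/SHUFFLE3 byte dumps in these tests possible: `std::shuffle` leaves the permutation unspecified, so libstdc++, libc++, and MSVC produce different orders even from an identical seeded engine, while a hand-rolled Fisher-Yates pins down the exact result. A sketch of the idea (illustrative only, not LLVM's actual implementation):

```cpp
#include <cstdio>
#include <random>
#include <utility>
#include <vector>

// Fisher-Yates with a fully specified draw at every step: given the same
// seeded engine, every toolchain produces the same permutation, which is
// what a FileCheck'd byte dump needs.
template <class Iter, class RNG>
void deterministicShuffle(Iter first, Iter last, RNG &&g) {
  for (auto n = last - first; n > 1; --n) {
    // g() % n has a slight modulo bias; fine for test shuffling, not for
    // anything cryptographic.
    auto k = static_cast<decltype(n)>(g() % static_cast<unsigned long long>(n));
    std::swap(first[n - 1], first[k]);
  }
}

int main() {
  std::vector<int> v{0, 1, 2, 3, 4, 5};
  std::mt19937 g(1); // fixed seed => identical order on every platform
  deterministicShuffle(v.begin(), v.end(), g);
  for (int x : v)
    std::printf("%d ", x); // same sequence everywhere
  std::printf("\n");
}
```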
"settings set plugin.process.gdb-remote.packet-timeout 60", @@ -2394,7 +2398,16 @@ self.assertTrue(expr.strip() == expr, "Expression contains trailing/leading whitespace: '" + expr + "'") frame = self.frame() - eval_result = frame.EvaluateExpression(expr) + options = lldb.SBExpressionOptions() + + # Disable fix-its that tests don't pass by accident. + options.SetAutoApplyFixIts(False) + + # Set the usual default options for normal expressions. + options.SetIgnoreBreakpoints(True) + options.SetLanguage(frame.GuessLanguage()) + + eval_result = frame.EvaluateExpression(expr, options) if error_msg: self.assertFalse(eval_result.IsValid(), "Unexpected success with result: '" + str(eval_result) + "'") diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h @@ -282,14 +282,9 @@ /// /// \param[in] namespace_decl /// If valid and module is non-NULL, the parent namespace. - /// - /// \param[in] current_id - /// The ID for the current FindExternalVisibleDecls invocation, - /// for logging purposes. void FindExternalVisibleDecls(NameSearchContext &context, lldb::ModuleSP module, - CompilerDeclContext &namespace_decl, - unsigned int current_id); + CompilerDeclContext &namespace_decl); /// Find all Objective-C methods matching a given selector. /// @@ -356,13 +351,11 @@ protected: bool FindObjCMethodDeclsWithOrigin( - unsigned int current_id, NameSearchContext &context, + NameSearchContext &context, clang::ObjCInterfaceDecl *original_interface_decl, const char *log_info); - void FindDeclInModules(NameSearchContext &context, ConstString name, - unsigned current_id); - void FindDeclInObjCRuntime(NameSearchContext &context, ConstString name, - unsigned current_id); + void FindDeclInModules(NameSearchContext &context, ConstString name); + void FindDeclInObjCRuntime(NameSearchContext &context, ConstString name); friend struct NameSearchContext; diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp @@ -195,18 +195,14 @@ void ClangASTSource::CompleteType(TagDecl *tag_decl) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); - static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; - if (log) { LLDB_LOG(log, - " CompleteTagDecl[{0}] on (ASTContext*){1} Completing " + " CompleteTagDecl on (ASTContext*){1} Completing " "(TagDecl*){2} named {3}", - current_id, m_clang_ast_context->getDisplayName(), tag_decl, + m_clang_ast_context->getDisplayName(), tag_decl, tag_decl->getName()); - LLDB_LOG(log, " CTD[%u] Before:\n{0}", current_id, - ClangUtil::DumpDecl(tag_decl)); + LLDB_LOG(log, " CTD Before:\n{0}", ClangUtil::DumpDecl(tag_decl)); } auto iter = m_active_lexical_decls.find(tag_decl); @@ -219,10 +215,8 @@ // We couldn't complete the type. Maybe there's a definition somewhere // else that can be completed. 
- LLDB_LOG(log, - " CTD[{0}] Type could not be completed in the module in " - "which it was first found.", - current_id); + LLDB_LOG(log, " CTD Type could not be completed in the module in " + "which it was first found."); bool found = false; @@ -234,9 +228,8 @@ m_ast_importer_sp->GetNamespaceMap(namespace_context); if (log && log->GetVerbose()) - LLDB_LOG(log, - " CTD[{0}] Inspecting namespace map{1} ({2} entries)", - current_id, namespace_map.get(), namespace_map->size()); + LLDB_LOG(log, " CTD Inspecting namespace map{1} ({2} entries)", + namespace_map.get(), namespace_map->size()); if (!namespace_map) return; @@ -244,9 +237,8 @@ for (ClangASTImporter::NamespaceMap::iterator i = namespace_map->begin(), e = namespace_map->end(); i != e && !found; ++i) { - LLDB_LOG(log, " CTD[{0}] Searching namespace {1} in module {2}", - current_id, i->second.GetName(), - i->first->GetFileSpec().GetFilename()); + LLDB_LOG(log, " CTD Searching namespace {1} in module {2}", + i->second.GetName(), i->first->GetFileSpec().GetFilename()); TypeList types; @@ -423,31 +415,27 @@ m_active_lexical_decls.insert(context_decl); ScopedLexicalDeclEraser eraser(m_active_lexical_decls, context_decl); - static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; - if (log) { if (const NamedDecl *context_named_decl = dyn_cast(context_decl)) LLDB_LOG(log, - "FindExternalLexicalDecls[{0}] on (ASTContext*){1} '{2}' in " + "FindExternalLexicalDecls on (ASTContext*){1} '{2}' in " "'{3}' (%sDecl*){4}", - current_id, m_ast_context, m_clang_ast_context->getDisplayName(), + m_ast_context, m_clang_ast_context->getDisplayName(), context_named_decl->getNameAsString().c_str(), context_decl->getDeclKindName(), static_cast(context_decl)); else if (context_decl) LLDB_LOG(log, - "FindExternalLexicalDecls[{0}] on (ASTContext*){1} '{2}' in " + "FindExternalLexicalDecls on (ASTContext*){1} '{2}' in " "({3}Decl*){4}", - current_id, m_ast_context, m_clang_ast_context->getDisplayName(), + m_ast_context, m_clang_ast_context->getDisplayName(), context_decl->getDeclKindName(), static_cast(context_decl)); else LLDB_LOG(log, - "FindExternalLexicalDecls[{0}] on (ASTContext*){1} '{2}' in a " + "FindExternalLexicalDecls on (ASTContext*){1} '{2}' in a " "NULL context", - current_id, m_ast_context, - m_clang_ast_context->getDisplayName()); + m_ast_context, m_clang_ast_context->getDisplayName()); } ClangASTImporter::DeclOrigin original = m_ast_importer_sp->GetDeclOrigin(context_decl); @@ -455,7 +443,7 @@ if (!original.Valid()) return; - LLDB_LOG(log, " FELD[{0}] Original decl {1} (Decl*){2:x}:\n{3}", current_id, + LLDB_LOG(log, " FELD Original decl {1} (Decl*){2:x}:\n{3}", static_cast(original.ctx), static_cast(original.decl), ClangUtil::DumpDecl(original.decl)); @@ -500,12 +488,12 @@ std::string ast_dump = ClangUtil::DumpDecl(decl); if (const NamedDecl *context_named_decl = dyn_cast(context_decl)) - LLDB_LOG( - log, " FELD[{0}] Adding [to {1}Decl {2}] lexical {3}Decl {4}", - current_id, context_named_decl->getDeclKindName(), - context_named_decl->getName(), decl->getDeclKindName(), ast_dump); + LLDB_LOG(log, " FELD Adding [to {1}Decl {2}] lexical {3}Decl {4}", + context_named_decl->getDeclKindName(), + context_named_decl->getName(), decl->getDeclKindName(), + ast_dump); else - LLDB_LOG(log, " FELD[{0}] Adding lexical {1}Decl {2}", current_id, + LLDB_LOG(log, " FELD Adding lexical {1}Decl {2}", decl->getDeclKindName(), ast_dump); } @@ -545,29 +533,25 @@ Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); - 
static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; - if (log) { if (!context.m_decl_context) LLDB_LOG(log, - "ClangASTSource::FindExternalVisibleDecls[{0}] on " - "(ASTContext*){1} '{2}' for '{3}' in a NULL DeclContext", - current_id, m_ast_context, m_clang_ast_context->getDisplayName(), - name); + "ClangASTSource::FindExternalVisibleDecls on " + "(ASTContext*){0} '{1}' for '{2}' in a NULL DeclContext", + m_ast_context, m_clang_ast_context->getDisplayName(), name); else if (const NamedDecl *context_named_decl = dyn_cast(context.m_decl_context)) LLDB_LOG(log, - "ClangASTSource::FindExternalVisibleDecls[{0}] on " - "(ASTContext*){1} '{2}' for '{3}' in '{4}'", - current_id, m_ast_context, m_clang_ast_context->getDisplayName(), - name, context_named_decl->getName()); + "ClangASTSource::FindExternalVisibleDecls on " + "(ASTContext*){0} '{1}' for '{2}' in '{3}'", + m_ast_context, m_clang_ast_context->getDisplayName(), name, + context_named_decl->getName()); else LLDB_LOG(log, - "ClangASTSource::FindExternalVisibleDecls[{0}] on " - "(ASTContext*){1} '{2}' for '{3}' in a '{4}'", - current_id, m_ast_context, m_clang_ast_context->getDisplayName(), - name, context.m_decl_context->getDeclKindName()); + "ClangASTSource::FindExternalVisibleDecls on " + "(ASTContext*){0} '{1}' for '{2}' in a '{3}'", + m_ast_context, m_clang_ast_context->getDisplayName(), name, + context.m_decl_context->getDeclKindName()); } context.m_namespace_map = std::make_shared(); @@ -578,9 +562,8 @@ m_ast_importer_sp->GetNamespaceMap(namespace_context); if (log && log->GetVerbose()) - LLDB_LOG(log, - " CAS::FEVD[{0}] Inspecting namespace map {1} ({2} entries)", - current_id, namespace_map.get(), namespace_map->size()); + LLDB_LOG(log, " CAS::FEVD Inspecting namespace map {0} ({1} entries)", + namespace_map.get(), namespace_map->size()); if (!namespace_map) return; @@ -588,11 +571,10 @@ for (ClangASTImporter::NamespaceMap::iterator i = namespace_map->begin(), e = namespace_map->end(); i != e; ++i) { - LLDB_LOG(log, " CAS::FEVD[{0}] Searching namespace {1} in module {2}", - current_id, i->second.GetName(), - i->first->GetFileSpec().GetFilename()); + LLDB_LOG(log, " CAS::FEVD Searching namespace {0} in module {1}", + i->second.GetName(), i->first->GetFileSpec().GetFilename()); - FindExternalVisibleDecls(context, i->first, i->second, current_id); + FindExternalVisibleDecls(context, i->first, i->second); } } else if (isa(context.m_decl_context)) { FindObjCPropertyAndIvarDecls(context); @@ -602,18 +584,15 @@ } else { CompilerDeclContext namespace_decl; - LLDB_LOG(log, " CAS::FEVD[{0}] Searching the root namespace", current_id); + LLDB_LOG(log, " CAS::FEVD Searching the root namespace"); - FindExternalVisibleDecls(context, lldb::ModuleSP(), namespace_decl, - current_id); + FindExternalVisibleDecls(context, lldb::ModuleSP(), namespace_decl); } if (!context.m_namespace_map->empty()) { if (log && log->GetVerbose()) - LLDB_LOG(log, - " CAS::FEVD[{0}] Registering namespace map {1} ({2} entries)", - current_id, context.m_namespace_map.get(), - context.m_namespace_map->size()); + LLDB_LOG(log, " CAS::FEVD Registering namespace map {0} ({1} entries)", + context.m_namespace_map.get(), context.m_namespace_map->size()); NamespaceDecl *clang_namespace_decl = AddNamespace(context, context.m_namespace_map); @@ -646,7 +625,7 @@ void ClangASTSource::FindExternalVisibleDecls( NameSearchContext &context, lldb::ModuleSP module_sp, - CompilerDeclContext &namespace_decl, unsigned int current_id) { + CompilerDeclContext &namespace_decl) { assert(m_ast_context); Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); @@ -671,8 +650,8 @@ std::pair( module_sp,
found_namespace_decl)); - LLDB_LOG(log, " CAS::FEVD[{0}] Found namespace {1} in module {2}", - current_id, name, module_sp->GetFileSpec().GetFilename()); + LLDB_LOG(log, " CAS::FEVD Found namespace {0} in module {1}", name, + module_sp->GetFileSpec().GetFilename()); } } } else { @@ -699,8 +678,8 @@ std::pair( image, found_namespace_decl)); - LLDB_LOG(log, " CAS::FEVD[{0}] Found namespace {1} in module {2}", - current_id, name, image->GetFileSpec().GetFilename()); + LLDB_LOG(log, " CAS::FEVD Found namespace {0} in module {1}", name, + image->GetFileSpec().GetFilename()); } } } @@ -725,8 +704,7 @@ if (log) { const char *name_string = type_sp->GetName().GetCString(); - LLDB_LOG(log, " CAS::FEVD[{0}] Matching type found for \"{1}\": {2}", - current_id, name, + LLDB_LOG(log, " CAS::FEVD Matching type found for \"{0}\": {1}", name, (name_string ? name_string : "")); } @@ -735,8 +713,7 @@ CompilerType copied_clang_type(GuardedCopyType(full_type)); if (!copied_clang_type) { - LLDB_LOG(log, " CAS::FEVD[{0}] - Couldn't export a type", - current_id); + LLDB_LOG(log, " CAS::FEVD - Couldn't export a type"); continue; } @@ -750,11 +727,11 @@ if (!context.m_found.type) { // Try the modules next. - FindDeclInModules(context, name, current_id); + FindDeclInModules(context, name); } if (!context.m_found.type) { - FindDeclInObjCRuntime(context, name, current_id); + FindDeclInObjCRuntime(context, name); } } @@ -809,8 +786,8 @@ } bool ClangASTSource::FindObjCMethodDeclsWithOrigin( - unsigned int current_id, NameSearchContext &context, - ObjCInterfaceDecl *original_interface_decl, const char *log_info) { + NameSearchContext &context, ObjCInterfaceDecl *original_interface_decl, + const char *log_info) { const DeclarationName &decl_name(context.m_decl_name); clang::ASTContext *original_ctx = &original_interface_decl->getASTContext(); @@ -881,7 +858,7 @@ Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); - LLDB_LOG(log, " CAS::FOMD[{0}] found ({1}) {2}", current_id, log_info, + LLDB_LOG(log, " CAS::FOMD found ({0}) {1}", log_info, ClangUtil::DumpDecl(copied_method_decl)); context.AddNamedDecl(copied_method_decl); @@ -891,7 +868,7 @@ } void ClangASTSource::FindDeclInModules(NameSearchContext &context, - ConstString name, unsigned current_id) { + ConstString name) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); ClangModulesDeclVendor *modules_decl_vendor = @@ -908,9 +885,9 @@ if (log) { LLDB_LOG(log, - " CAS::FEVD[{0}] Matching entity found for \"{1}\" in " - "the modules", - current_id, name); + " CAS::FEVD Matching entity found for \"{0}\" in " + "the modules", + name); } clang::NamedDecl *const decl_from_modules = decls[0]; @@ -923,9 +900,7 @@ copied_decl ? dyn_cast(copied_decl) : nullptr; if (!copied_named_decl) { - LLDB_LOG(log, - " CAS::FEVD[{0}] - Couldn't export a type from the modules", - current_id); + LLDB_LOG(log, " CAS::FEVD - Couldn't export a type from the modules"); return; } @@ -937,8 +912,7 @@ } void ClangASTSource::FindDeclInObjCRuntime(NameSearchContext &context, - ConstString name, - unsigned current_id) { + ConstString name) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); lldb::ProcessSP process(m_target->GetProcessSP()); @@ -965,9 +939,8 @@ return; if (log) { - LLDB_LOG(log, - " CAS::FEVD[{0}] Matching type found for \"{0}\" in the runtime", - current_id, name); + LLDB_LOG(log, " CAS::FEVD Matching type found for \"{0}\" in the runtime", + name); } clang::Decl *copied_decl = CopyDecl(decls[0]); @@ -975,8 +948,7 @@ copied_decl ?
dyn_cast(copied_decl) : nullptr; if (!copied_named_decl) { - LLDB_LOG(log, " CAS::FEVD[{0}] - Couldn't export a type from the runtime", - current_id); + LLDB_LOG(log, " CAS::FEVD - Couldn't export a type from the runtime"); return; } @@ -987,9 +959,6 @@ void ClangASTSource::FindObjCMethodDecls(NameSearchContext &context) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); - static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; - const DeclarationName &decl_name(context.m_decl_name); const DeclContext *decl_ctx(context.m_decl_context); @@ -1008,8 +977,8 @@ ObjCInterfaceDecl *original_interface_decl = dyn_cast(original.decl); - if (FindObjCMethodDeclsWithOrigin(current_id, context, - original_interface_decl, "at origin")) + if (FindObjCMethodDeclsWithOrigin(context, original_interface_decl, + "at origin")) return; // found it, no need to look any further } while (false); @@ -1035,9 +1004,9 @@ ConstString selector_name(ss.GetString()); LLDB_LOG(log, - "ClangASTSource::FindObjCMethodDecls[{0}] on (ASTContext*){1} '{2}' " - "for selector [{3} {4}]", - current_id, m_ast_context, m_clang_ast_context->getDisplayName(), + "ClangASTSource::FindObjCMethodDecls on (ASTContext*){0} '{1}' " + "for selector [{2} {3}]", + m_ast_context, m_clang_ast_context->getDisplayName(), interface_decl->getName(), selector_name); SymbolContextList sc_list; @@ -1158,7 +1127,7 @@ if (!copied_method_decl) continue; - LLDB_LOG(log, " CAS::FOMD[{0}] found (in symbols)\n{1}", current_id, + LLDB_LOG(log, " CAS::FOMD found (in symbols)\n{0}", ClangUtil::DumpDecl(copied_method_decl)); context.AddNamedDecl(copied_method_decl); @@ -1187,12 +1156,11 @@ break; // already checked this one LLDB_LOG(log, - "CAS::FOPD[{0}] trying origin " - "(ObjCInterfaceDecl*){1}/(ASTContext*){2}...", - current_id, complete_interface_decl, - &complete_iface_decl->getASTContext()); + "CAS::FOPD trying origin " + "(ObjCInterfaceDecl*){0}/(ASTContext*){1}...", + complete_interface_decl, &complete_iface_decl->getASTContext()); - FindObjCMethodDeclsWithOrigin(current_id, context, complete_interface_decl, + FindObjCMethodDeclsWithOrigin(context, complete_interface_decl, "in debug info"); return; @@ -1219,8 +1187,8 @@ if (!interface_decl_from_modules) break; - if (FindObjCMethodDeclsWithOrigin( - current_id, context, interface_decl_from_modules, "in modules")) + if (FindObjCMethodDeclsWithOrigin(context, interface_decl_from_modules, + "in modules")) return; } } while (false); @@ -1260,13 +1228,13 @@ if (!runtime_interface_decl) break; - FindObjCMethodDeclsWithOrigin(current_id, context, runtime_interface_decl, + FindObjCMethodDeclsWithOrigin(context, runtime_interface_decl, "in runtime"); } while (false); } static bool FindObjCPropertyAndIvarDeclsWithOrigin( - unsigned int current_id, NameSearchContext &context, ClangASTSource &source, + NameSearchContext &context, ClangASTSource &source, DeclFromUser &origin_iface_decl) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); @@ -1288,7 +1256,7 @@ DeclFromParser parser_property_decl( origin_property_decl.Import(source)); if (parser_property_decl.IsValid()) { - LLDB_LOG(log, " CAS::FOPD[{0}] found\n{1}", current_id, + LLDB_LOG(log, " CAS::FOPD found\n{0}", ClangUtil::DumpDecl(parser_property_decl.decl)); context.AddNamedDecl(parser_property_decl.decl); @@ -1304,7 +1272,7 @@ origin_ivar_decl.Import(source)); if (parser_ivar_decl.IsValid()) { if (log) { - LLDB_LOG(log, " CAS::FOPD[{0}] found\n{1}", current_id, + LLDB_LOG(log, " CAS::FOPD found\n{0}", ClangUtil::DumpDecl(parser_ivar_decl.decl)); } @@ -1319,9
+1287,6 @@ void ClangASTSource::FindObjCPropertyAndIvarDecls(NameSearchContext &context) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); - static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; - DeclFromParser parser_iface_decl( cast(context.m_decl_context)); DeclFromUser origin_iface_decl( @@ -1330,21 +1295,19 @@ ConstString class_name(parser_iface_decl->getNameAsString().c_str()); LLDB_LOG(log, - "ClangASTSource::FindObjCPropertyAndIvarDecls[{0}] on " - "(ASTContext*){1} '{2}' for '{3}.{4}'", - current_id, m_ast_context, m_clang_ast_context->getDisplayName(), + "ClangASTSource::FindObjCPropertyAndIvarDecls on " + "(ASTContext*){0} '{1}' for '{2}.{3}'", + m_ast_context, m_clang_ast_context->getDisplayName(), parser_iface_decl->getName(), context.m_decl_name.getAsString()); - if (FindObjCPropertyAndIvarDeclsWithOrigin( - current_id, context, *this, origin_iface_decl)) + if (FindObjCPropertyAndIvarDeclsWithOrigin(context, *this, origin_iface_decl)) return; LLDB_LOG(log, - "CAS::FOPD[{0}] couldn't find the property on origin " - "(ObjCInterfaceDecl*){1}/(ASTContext*){2}, searching " + "CAS::FOPD couldn't find the property on origin " + "(ObjCInterfaceDecl*){0}/(ASTContext*){1}, searching " "elsewhere...", - current_id, origin_iface_decl.decl, - &origin_iface_decl->getASTContext()); + origin_iface_decl.decl, &origin_iface_decl->getASTContext()); SymbolContext null_sc; TypeList type_list; @@ -1366,13 +1329,11 @@ break; // already checked this one LLDB_LOG(log, - "CAS::FOPD[{0}] trying origin " - "(ObjCInterfaceDecl*){1}/(ASTContext*){2}...", - current_id, complete_iface_decl.decl, - &complete_iface_decl->getASTContext()); + "CAS::FOPD trying origin " + "(ObjCInterfaceDecl*){0}/(ASTContext*){1}...", + complete_iface_decl.decl, &complete_iface_decl->getASTContext()); - FindObjCPropertyAndIvarDeclsWithOrigin(current_id, context, *this, - complete_iface_decl); + FindObjCPropertyAndIvarDeclsWithOrigin(context, *this, complete_iface_decl); return; } while (false); @@ -1403,10 +1364,10 @@ LLDB_LOG(log, - "CAS::FOPD[{0}] trying module " - "(ObjCInterfaceDecl*){1}/(ASTContext*){2}...", - current_id, interface_decl_from_modules.decl, + "CAS::FOPD trying module " + "(ObjCInterfaceDecl*){0}/(ASTContext*){1}...", + interface_decl_from_modules.decl, &interface_decl_from_modules->getASTContext()); - if (FindObjCPropertyAndIvarDeclsWithOrigin(current_id, context, *this, + if (FindObjCPropertyAndIvarDeclsWithOrigin(context, *this, interface_decl_from_modules)) return; } while (false); @@ -1447,11 +1408,11 @@ LLDB_LOG(log, - "CAS::FOPD[{0}] trying runtime " - "(ObjCInterfaceDecl*){1}/(ASTContext*){2}...", - current_id, interface_decl_from_runtime.decl, + "CAS::FOPD trying runtime " + "(ObjCInterfaceDecl*){0}/(ASTContext*){1}...", + interface_decl_from_runtime.decl, &interface_decl_from_runtime->getASTContext()); - if (FindObjCPropertyAndIvarDeclsWithOrigin( - current_id, context, *this, interface_decl_from_runtime)) + if (FindObjCPropertyAndIvarDeclsWithOrigin(context, *this, + interface_decl_from_runtime)) return; } while (false); } @@ -1541,16 +1502,14 @@ FieldOffsetMap &field_offsets, BaseOffsetMap &base_offsets, BaseOffsetMap &virtual_base_offsets) { - static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); LLDB_LOG(log, - "LayoutRecordType[{0}] on (ASTContext*){1} '{2}' for (RecordDecl*)" - "{3} [name = '{4}']", - current_id, m_ast_context, m_clang_ast_context->getDisplayName(), - record, record->getName()); + "LayoutRecordType on (ASTContext*){0} '{1}' for (RecordDecl*)" + "{2} [name = '{3}']", + m_ast_context, m_clang_ast_context->getDisplayName(), record, + record->getName()); DeclFromParser parser_record(record); DeclFromUser origin_record( @@ -1614,23 +1573,23 @@
m_ast_context->getCharWidth(); if (log) { - LLDB_LOG(log, "LRT[{0}] returned:", current_id); - LLDB_LOG(log, "LRT[{0}] Original = (RecordDecl*)%p", current_id, + LLDB_LOG(log, "LRT returned:"); + LLDB_LOG(log, "LRT Original = (RecordDecl*){0}", static_cast(origin_record.decl)); - LLDB_LOG(log, "LRT[{0}] Size = %" PRId64, current_id, size); - LLDB_LOG(log, "LRT[{0}] Alignment = %" PRId64, current_id, alignment); - LLDB_LOG(log, "LRT[{0}] Fields:", current_id); + LLDB_LOG(log, "LRT Size = {0}", size); + LLDB_LOG(log, "LRT Alignment = {0}", alignment); + LLDB_LOG(log, "LRT Fields:"); for (RecordDecl::field_iterator fi = record->field_begin(), fe = record->field_end(); fi != fe; ++fi) { - LLDB_LOG(log, "LRT[{0}] (FieldDecl*){1}, Name = '{2}', Offset = {3} bits", - current_id, *fi, fi->getName(), field_offsets[*fi]); + LLDB_LOG(log, "LRT (FieldDecl*){0}, Name = '{1}', Offset = {2} bits", + *fi, fi->getName(), field_offsets[*fi]); } DeclFromParser parser_cxx_record = DynCast(parser_record); if (parser_cxx_record.IsValid()) { - LLDB_LOG(log, "LRT[{0}] Bases:", current_id); + LLDB_LOG(log, "LRT Bases:"); for (CXXRecordDecl::base_class_const_iterator bi = parser_cxx_record->bases_begin(), be = parser_cxx_record->bases_end(); @@ -1644,16 +1603,16 @@ DynCast(base_record); LLDB_LOG(log, - "LRT[{0}] {1}(CXXRecordDecl*){2}, Name = '{3}', Offset = " - "{4} chars", - current_id, (is_virtual ? "Virtual " : ""), - base_cxx_record.decl, base_cxx_record.decl->getName(), + "LRT {0}(CXXRecordDecl*){1}, Name = '{2}', Offset = " + "{3} chars", + (is_virtual ? "Virtual " : ""), base_cxx_record.decl, + base_cxx_record.decl->getName(), (is_virtual ? virtual_base_offsets[base_cxx_record.decl].getQuantity() : base_offsets[base_cxx_record.decl].getQuantity())); } } else { - LLDB_LOG(log, "LRD[{0}] Not a CXXRecord, so no bases", current_id); + LLDB_LOG(log, "LRD Not a CXXRecord, so no bases"); } } @@ -1663,24 +1622,21 @@ void ClangASTSource::CompleteNamespaceMap( ClangASTImporter::NamespaceMapSP &namespace_map, ConstString name, ClangASTImporter::NamespaceMapSP &parent_map) const { - static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); if (log) { if (parent_map && parent_map->size()) LLDB_LOG(log, - "CompleteNamespaceMap[{0}] on (ASTContext*){1} '{2}' Searching " - "for namespace {3} in namespace {4}", - current_id, m_ast_context, m_clang_ast_context->getDisplayName(), - name, parent_map->begin()->second.GetName()); + "CompleteNamespaceMap on (ASTContext*){0} '{1}' Searching " + "for namespace {2} in namespace {3}", + m_ast_context, m_clang_ast_context->getDisplayName(), name, + parent_map->begin()->second.GetName()); else LLDB_LOG(log, - "CompleteNamespaceMap[{0}] on (ASTContext*){1} '{2}' Searching " - "for namespace {3}", - current_id, m_ast_context, m_clang_ast_context->getDisplayName(), - name); + "CompleteNamespaceMap on (ASTContext*){0} '{1}' Searching " + "for namespace {2}", + m_ast_context, m_clang_ast_context->getDisplayName(), name); } if (parent_map) { @@ -1706,8 +1662,8 @@ namespace_map->push_back(std::pair( module_sp, found_namespace_decl)); - LLDB_LOG(log, " CMN[{0}] Found namespace {1} in module {2}", current_id, - name, module_sp->GetFileSpec().GetFilename()); + LLDB_LOG(log, " CMN Found namespace {0} in module {1}", name, + module_sp->GetFileSpec().GetFilename()); } } else { const ModuleList &target_images = m_target->GetImages(); @@ -1737,8 +1693,8 @@ namespace_map->push_back(std::pair( image, found_namespace_decl)); - LLDB_LOG(log, " CMN[{0}] Found namespace {1} in module {2}", current_id, - name, image->GetFileSpec().GetFilename()); +
LLDB_LOG(log, " CMN[{0}] Found namespace {1} in module {2}", name, + image->GetFileSpec().GetFilename()); } } } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.h --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.h @@ -275,14 +275,9 @@ /// /// \param[in] namespace_decl /// If valid and module is non-NULL, the parent namespace. - /// - /// \param[in] current_id - /// The ID for the current FindExternalVisibleDecls invocation, - /// for logging purposes. void FindExternalVisibleDecls(NameSearchContext &context, lldb::ModuleSP module, - const CompilerDeclContext &namespace_decl, - unsigned int current_id); + const CompilerDeclContext &namespace_decl); protected: /// Retrieves the declaration with the given name from the storage of @@ -395,32 +390,19 @@ /// /// \param[in] name /// The name of the entities that need to be found. - /// - /// \param[in] current_id - /// The ID for the current FindExternalVisibleDecls invocation, - /// for logging purposes. - void SearchPersistenDecls(NameSearchContext &context, const ConstString name, - unsigned int current_id); + void SearchPersistenDecls(NameSearchContext &context, const ConstString name); /// Handles looking up $__lldb_class which requires special treatment. /// /// \param[in] context /// The NameSearchContext that can construct Decls for this name. - /// - /// \param[in] current_id - /// The ID for the current FindExternalVisibleDecls invocation, - /// for logging purposes. - void LookUpLldbClass(NameSearchContext &context, unsigned int current_id); + void LookUpLldbClass(NameSearchContext &context); /// Handles looking up $__lldb_objc_class which requires special treatment. /// /// \param[in] context /// The NameSearchContext that can construct Decls for this name. - /// - /// \param[in] current_id - /// The ID for the current FindExternalVisibleDecls invocation, - /// for logging purposes. - void LookUpLldbObjCClass(NameSearchContext &context, unsigned int current_id); + void LookUpLldbObjCClass(NameSearchContext &context); /// Handles looking up the synthetic namespace that contains our local /// variables for the current frame. @@ -439,12 +421,7 @@ /// /// \param[in] name /// The name of the entities that need to be found. - /// - /// \param[in] current_id - /// The ID for the current FindExternalVisibleDecls invocation, - /// for logging purposes. - void LookupInModulesDeclVendor(NameSearchContext &context, ConstString name, - unsigned current_id); + void LookupInModulesDeclVendor(NameSearchContext &context, ConstString name); /// Looks up a local variable. /// @@ -454,10 +431,6 @@ /// \param[in] name /// The name of the entities that need to be found. /// - /// \param[in] current_id - /// The ID for the current FindExternalVisibleDecls invocation, - /// for logging purposes. - /// /// \param[in] sym_ctx /// The current SymbolContext of this frame. /// @@ -467,7 +440,7 @@ /// \return /// True iff a local variable was found. bool LookupLocalVariable(NameSearchContext &context, ConstString name, - unsigned current_id, SymbolContext &sym_ctx, + SymbolContext &sym_ctx, const CompilerDeclContext &namespace_decl); /// Searches for functions in the given SymbolContextList. @@ -500,14 +473,9 @@ /// /// \param[in] namespace_decl /// If valid and module is non-NULL, the parent namespace. 
- /// - /// \param[in] current_id - /// The ID for the current FindExternalVisibleDecls invocation, - /// for logging purposes. void LookupFunction(NameSearchContext &context, lldb::ModuleSP module_sp, ConstString name, - const CompilerDeclContext &namespace_decl, - unsigned current_id); + const CompilerDeclContext &namespace_decl); /// Given a target, find a variable that matches the given name and type. /// @@ -567,7 +535,7 @@ /// \param[in] valobj /// The LLDB ValueObject for that variable. void AddOneVariable(NameSearchContext &context, lldb::VariableSP var, - lldb::ValueObjectSP valobj, unsigned int current_id); + lldb::ValueObjectSP valobj); /// Use the NameSearchContext to generate a Decl for the given persistent /// variable, and put it in the list of found entities. @@ -577,18 +545,12 @@ /// /// \param[in] pvar_sp /// The persistent variable that needs a Decl. - /// - /// \param[in] current_id - /// The ID of the current invocation of FindExternalVisibleDecls - /// for logging purposes. void AddOneVariable(NameSearchContext &context, - lldb::ExpressionVariableSP &pvar_sp, - unsigned int current_id); + lldb::ExpressionVariableSP &pvar_sp); /// Use the NameSearchContext to generate a Decl for the given LLDB symbol /// (treated as a variable), and put it in the list of found entities. - void AddOneGenericVariable(NameSearchContext &context, const Symbol &symbol, - unsigned int current_id); + void AddOneGenericVariable(NameSearchContext &context, const Symbol &symbol); /// Use the NameSearchContext to generate a Decl for the given function. /// (Functions are not placed in the Tuple list.) Can handle both fully @@ -604,8 +566,7 @@ /// \param[in] sym /// The Symbol that corresponds to a function that needs to be /// created with generic type (unitptr_t foo(...)). - void AddOneFunction(NameSearchContext &context, Function *fun, Symbol *sym, - unsigned int current_id); + void AddOneFunction(NameSearchContext &context, Function *fun, Symbol *sym); /// Use the NameSearchContext to generate a Decl for the given register. /// @@ -614,8 +575,7 @@ /// /// \param[in] reg_info /// The information corresponding to that register. - void AddOneRegister(NameSearchContext &context, const RegisterInfo *reg_info, - unsigned int current_id); + void AddOneRegister(NameSearchContext &context, const RegisterInfo *reg_info); /// Use the NameSearchContext to generate a Decl for the given type. (Types /// are not placed in the Tuple list.) @@ -625,8 +585,7 @@ /// /// \param[in] type /// The type that needs to be created. - void AddOneType(NameSearchContext &context, const TypeFromUser &type, - unsigned int current_id); + void AddOneType(NameSearchContext &context, const TypeFromUser &type); /// Generate a Decl for "*this" and add a member function declaration to it /// for the expression, then report it. @@ -636,8 +595,7 @@ /// /// \param[in] type /// The type for *this. - void AddThisType(NameSearchContext &context, const TypeFromUser &type, - unsigned int current_id); + void AddThisType(NameSearchContext &context, const TypeFromUser &type); /// Move a type out of the current ASTContext into another, but make sure to /// export all components of the type also. 
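A note on the renumbering running through these hunks: LLDB_LOG renders its message with llvm::formatv, so once the leading current_id argument is removed, every {N} placeholder in the format string has to shift down by one; a stale index either prints the wrong argument or refers past the end of the argument list. A minimal standalone sketch of that rule, using llvm::formatv directly (illustrative only, not part of the patch):

#include "llvm/Support/FormatVariadic.h"
#include <iostream>
#include <string>

int main() {
  std::string name = "std";
  unsigned entries = 3;
  // Old shape: formatv("map[{0}] {1} ({2} entries)", current_id, name, entries).
  // Once current_id is dropped, the surviving arguments re-index from zero:
  std::string line = llvm::formatv("map {0} ({1} entries)", name, entries).str();
  std::cout << line << "\n"; // prints "map std (3 entries)"
  return 0;
}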
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp @@ -640,28 +640,24 @@ return; } - static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; - if (log) { if (!context.m_decl_context) LLDB_LOGF(log, - "ClangExpressionDeclMap::FindExternalVisibleDecls[%u] for " + "ClangExpressionDeclMap::FindExternalVisibleDecls for " "'%s' in a NULL DeclContext", - current_id, name.GetCString()); + name.GetCString()); else if (const NamedDecl *context_named_decl = dyn_cast(context.m_decl_context)) LLDB_LOGF(log, - "ClangExpressionDeclMap::FindExternalVisibleDecls[%u] for " + "ClangExpressionDeclMap::FindExternalVisibleDecls for " "'%s' in '%s'", - current_id, name.GetCString(), + name.GetCString(), context_named_decl->getNameAsString().c_str()); else LLDB_LOGF(log, - "ClangExpressionDeclMap::FindExternalVisibleDecls[%u] for " + "ClangExpressionDeclMap::FindExternalVisibleDecls for " "'%s' in a '%s'", - current_id, name.GetCString(), - context.m_decl_context->getDeclKindName()); + name.GetCString(), context.m_decl_context->getDeclKindName()); } if (const NamespaceDecl *namespace_context = @@ -671,8 +667,7 @@ CompilerDeclContext compiler_decl_ctx = m_clang_ast_context->CreateDeclContext( const_cast(context.m_decl_context)); - FindExternalVisibleDecls(context, lldb::ModuleSP(), compiler_decl_ctx, - current_id); + FindExternalVisibleDecls(context, lldb::ModuleSP(), compiler_decl_ctx); return; } @@ -683,28 +678,27 @@ return; if (log && log->GetVerbose()) - log->Printf(" CEDM::FEVD[%u] Inspecting (NamespaceMap*)%p (%d entries)", - current_id, static_cast(namespace_map.get()), + log->Printf(" CEDM::FEVD Inspecting (NamespaceMap*)%p (%d entries)", + static_cast(namespace_map.get()), (int)namespace_map->size()); for (ClangASTImporter::NamespaceMap::iterator i = namespace_map->begin(), e = namespace_map->end(); i != e; ++i) { if (log) - log->Printf(" CEDM::FEVD[%u] Searching namespace %s in module %s", - current_id, i->second.GetName().AsCString(), + log->Printf(" CEDM::FEVD Searching namespace %s in module %s", + i->second.GetName().AsCString(), i->first->GetFileSpec().GetFilename().GetCString()); - FindExternalVisibleDecls(context, i->first, i->second, current_id); + FindExternalVisibleDecls(context, i->first, i->second); } } else if (isa(context.m_decl_context)) { CompilerDeclContext namespace_decl; if (log) - log->Printf(" CEDM::FEVD[%u] Searching the root namespace", current_id); + log->Printf(" CEDM::FEVD Searching the root namespace"); - FindExternalVisibleDecls(context, lldb::ModuleSP(), namespace_decl, - current_id); + FindExternalVisibleDecls(context, lldb::ModuleSP(), namespace_decl); } ClangASTSource::FindExternalVisibleDecls(context); @@ -733,8 +727,7 @@ } void ClangExpressionDeclMap::SearchPersistenDecls(NameSearchContext &context, - const ConstString name, - unsigned int current_id) { + const ConstString name) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); NamedDecl *persistent_decl = GetPersistentDecl(name); @@ -757,14 +750,12 @@ MaybeRegisterFunctionBody(parser_function_decl); } - LLDB_LOGF(log, " CEDM::FEVD[%u] Found persistent decl %s", current_id, - name.GetCString()); + LLDB_LOG(log, " CEDM::FEVD Found persistent decl {0}", name); context.AddNamedDecl(parser_named_decl); } -void
ClangExpressionDeclMap::LookUpLldbClass(NameSearchContext &context, - unsigned int current_id) { +void ClangExpressionDeclMap::LookUpLldbClass(NameSearchContext &context) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); StackFrame *frame = m_parser_vars->m_exe_ctx.GetFramePtr(); @@ -779,8 +770,7 @@ if (!ctx_obj_ptr || status.Fail()) return; - AddThisType(context, TypeFromUser(m_ctx_obj->GetCompilerType()), - current_id); + AddThisType(context, TypeFromUser(m_ctx_obj->GetCompilerType())); m_struct_vars->m_object_pointer_type = TypeFromUser(ctx_obj_ptr->GetCompilerType()); @@ -815,10 +805,10 @@ TypeFromUser class_user_type(class_qual_type.getAsOpaquePtr(), function_decl_ctx.GetTypeSystem()); - LLDB_LOG(log, " CEDM::FEVD[{0}] Adding type for $__lldb_class: {1}", - current_id, class_qual_type.getAsString()); + LLDB_LOG(log, " CEDM::FEVD Adding type for $__lldb_class: {0}", + class_qual_type.getAsString()); - AddThisType(context, class_user_type, current_id); + AddThisType(context, class_user_type); if (method_decl->isInstance()) { // self is a pointer to the object @@ -857,17 +847,16 @@ TypeFromUser pointee_type = this_type->GetForwardCompilerType().GetPointeeType(); - LLDB_LOG(log, " FEVD[{0}] Adding type for $__lldb_class: {1}", current_id, + LLDB_LOG(log, " FEVD Adding type for $__lldb_class: {0}", ClangUtil::GetQualType(pointee_type).getAsString()); - AddThisType(context, pointee_type, current_id); + AddThisType(context, pointee_type); TypeFromUser this_user_type(this_type->GetFullCompilerType()); m_struct_vars->m_object_pointer_type = this_user_type; } } -void ClangExpressionDeclMap::LookUpLldbObjCClass(NameSearchContext &context, - unsigned int current_id) { +void ClangExpressionDeclMap::LookUpLldbObjCClass(NameSearchContext &context) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); StackFrame *frame = m_parser_vars->m_exe_ctx.GetFramePtr(); @@ -878,7 +867,7 @@ if (!ctx_obj_ptr || status.Fail()) return; - AddOneType(context, TypeFromUser(m_ctx_obj->GetCompilerType()), current_id); + AddOneType(context, TypeFromUser(m_ctx_obj->GetCompilerType())); m_struct_vars->m_object_pointer_type = TypeFromUser(ctx_obj_ptr->GetCompilerType()); @@ -924,9 +913,9 @@ function_decl_ctx.GetTypeSystem()); - LLDB_LOG(log, " FEVD[{0}] Adding type for $__lldb_objc_class: {1}", - current_id, ClangUtil::ToString(interface_type)); + LLDB_LOG(log, " FEVD Adding type for $__lldb_objc_class: {0}", + ClangUtil::ToString(interface_type)); - AddOneType(context, class_user_type, current_id); + AddOneType(context, class_user_type); if (method_decl->isInstanceMethod()) { // self is a pointer to the object @@ -986,11 +975,11 @@ return; - LLDB_LOG(log, " FEVD[{0}] Adding type for $__lldb_objc_class: {1}", - current_id, ClangUtil::ToString(self_type->GetFullCompilerType())); + LLDB_LOG(log, " FEVD Adding type for $__lldb_objc_class: {0}", + ClangUtil::ToString(self_type->GetFullCompilerType())); TypeFromUser class_user_type(self_clang_type); - AddOneType(context, class_user_type, current_id); + AddOneType(context, class_user_type); TypeFromUser self_user_type(self_type->GetFullCompilerType()); @@ -1024,7 +1013,7 @@ } void ClangExpressionDeclMap::LookupInModulesDeclVendor( - NameSearchContext &context, ConstString name, unsigned current_id) { + NameSearchContext &context, ConstString name) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); if (!m_target) return; @@ -1045,16 +1034,14 @@ clang::NamedDecl *const decl_from_modules = decls[0]; LLDB_LOG(log, - " CAS::FEVD[{0}] Matching decl found for " - "\"{1}\" in the modules", - current_id, + " CAS::FEVD Matching decl found for " + "\"{0}\" in the modules", +
name); clang::Decl *copied_decl = CopyDecl(decl_from_modules); if (!copied_decl) { - LLDB_LOG(log, - " CAS::FEVD[{0}] - Couldn't export a " - "declaration from the modules", - current_id); + LLDB_LOG(log, " CAS::FEVD - Couldn't export a " + "declaration from the modules"); return; } @@ -1072,8 +1059,8 @@ } bool ClangExpressionDeclMap::LookupLocalVariable( - NameSearchContext &context, ConstString name, unsigned current_id, - SymbolContext &sym_ctx, const CompilerDeclContext &namespace_decl) { + NameSearchContext &context, ConstString name, SymbolContext &sym_ctx, + const CompilerDeclContext &namespace_decl) { if (sym_ctx.block == nullptr) return false; @@ -1108,7 +1095,7 @@ if (var && !variable_found) { variable_found = true; ValueObjectSP valobj = ValueObjectVariable::Create(frame, var); - AddOneVariable(context, var, valobj, current_id); + AddOneVariable(context, var, valobj); context.m_found.variable = true; } } @@ -1215,7 +1202,7 @@ void ClangExpressionDeclMap::LookupFunction( NameSearchContext &context, lldb::ModuleSP module_sp, ConstString name, - const CompilerDeclContext &namespace_decl, unsigned current_id) { + const CompilerDeclContext &namespace_decl) { if (!m_parser_vars) return; @@ -1293,7 +1280,7 @@ if (decl_ctx.IsClassMethod(nullptr, nullptr, nullptr)) continue; - AddOneFunction(context, sym_ctx.function, nullptr, current_id); + AddOneFunction(context, sym_ctx.function, nullptr); context.m_found.function_with_type_info = true; context.m_found.function = true; } else if (sym_ctx.symbol) { @@ -1325,10 +1312,10 @@ if (!context.m_found.function_with_type_info) { if (extern_symbol) { - AddOneFunction(context, nullptr, extern_symbol, current_id); + AddOneFunction(context, nullptr, extern_symbol); context.m_found.function = true; } else if (non_extern_symbol) { - AddOneFunction(context, nullptr, non_extern_symbol, current_id); + AddOneFunction(context, nullptr, non_extern_symbol); context.m_found.function = true; } } @@ -1337,7 +1324,7 @@ void ClangExpressionDeclMap::FindExternalVisibleDecls( NameSearchContext &context, lldb::ModuleSP module_sp, - const CompilerDeclContext &namespace_decl, unsigned int current_id) { + const CompilerDeclContext &namespace_decl) { assert(m_ast_context); Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); @@ -1362,16 +1349,16 @@ // Try the persistent decls, which take precedence over all else. 
if (!namespace_decl) - SearchPersistenDecls(context, name, current_id); + SearchPersistenDecls(context, name); if (name.GetStringRef().startswith("$") && !namespace_decl) { if (name == "$__lldb_class") { - LookUpLldbClass(context, current_id); + LookUpLldbClass(context); return; } if (name == "$__lldb_objc_class") { - LookUpLldbObjCClass(context, current_id); + LookUpLldbObjCClass(context); return; } if (name == g_lldb_local_vars_namespace_cstr) { @@ -1391,7 +1378,7 @@ m_parser_vars->m_persistent_vars->GetVariable(name)); if (pvar_sp) { - AddOneVariable(context, pvar_sp, current_id); + AddOneVariable(context, pvar_sp); return; } @@ -1404,10 +1391,9 @@ reg_name)); if (reg_info) { - LLDB_LOGF(log, " CEDM::FEVD[%u] Found register %s", current_id, - reg_info->name); + LLDB_LOGF(log, " CEDM::FEVD Found register %s", reg_info->name); - AddOneRegister(context, reg_info, current_id); + AddOneRegister(context, reg_info); } } return; @@ -1416,7 +1402,7 @@ bool local_var_lookup = !namespace_decl || (namespace_decl.GetName() == g_lldb_local_vars_namespace_cstr); if (frame && local_var_lookup) - if (LookupLocalVariable(context, name, current_id, sym_ctx, namespace_decl)) + if (LookupLocalVariable(context, name, sym_ctx, namespace_decl)) return; if (target) { @@ -1426,17 +1412,17 @@ if (var) { valobj = ValueObjectVariable::Create(target, var); - AddOneVariable(context, var, valobj, current_id); + AddOneVariable(context, var, valobj); context.m_found.variable = true; return; } } - LookupFunction(context, module_sp, name, namespace_decl, current_id); + LookupFunction(context, module_sp, name, namespace_decl); // Try the modules next. if (!context.m_found.function_with_type_info) - LookupInModulesDeclVendor(context, name, current_id); + LookupInModulesDeclVendor(context, name); if (target && !context.m_found.variable && !namespace_decl) { // We couldn't find a non-symbol variable for this. 
Now we'll hunt for a @@ -1460,7 +1446,7 @@ m_ast_context->getDiagnostics().getCustomDiagID( clang::DiagnosticsEngine::Level::Warning, "%0"); m_ast_context->getDiagnostics().Report(diag_id) << warning.c_str(); - AddOneGenericVariable(context, *data_symbol, current_id); + AddOneGenericVariable(context, *data_symbol); context.m_found.variable = true; } } @@ -1556,8 +1542,7 @@ void ClangExpressionDeclMap::AddOneVariable(NameSearchContext &context, VariableSP var, - ValueObjectSP valobj, - unsigned int current_id) { + ValueObjectSP valobj) { assert(m_parser_vars.get()); Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); @@ -1608,15 +1593,12 @@ if (is_reference) entity->m_flags |= ClangExpressionVariable::EVTypeIsReference; - LLDB_LOG(log, - " CEDM::FEVD[{0}] Found variable {1}, returned\n{2} (original {3})", - current_id, decl_name, ClangUtil::DumpDecl(var_decl), - ClangUtil::ToString(ut)); + LLDB_LOG(log, " CEDM::FEVD Found variable {0}, returned\n{1} (original {2})", + decl_name, ClangUtil::DumpDecl(var_decl), ClangUtil::ToString(ut)); } void ClangExpressionDeclMap::AddOneVariable(NameSearchContext &context, - ExpressionVariableSP &pvar_sp, - unsigned int current_id) { + ExpressionVariableSP &pvar_sp) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); TypeFromUser user_type( @@ -1625,8 +1607,8 @@ TypeFromParser parser_type(GuardedCopyType(user_type)); if (!parser_type.GetOpaqueQualType()) { - LLDB_LOGF(log, " CEDM::FEVD[%u] Couldn't import type for pvar %s", - current_id, pvar_sp->GetName().GetCString()); + LLDB_LOGF(log, " CEDM::FEVD Couldn't import type for pvar %s", + pvar_sp->GetName().GetCString()); return; } @@ -1642,13 +1624,12 @@ parser_vars->m_llvm_value = nullptr; parser_vars->m_lldb_value.Clear(); - LLDB_LOG(log, " CEDM::FEVD[{0}] Added pvar {1}, returned\n{2}", current_id, + LLDB_LOG(log, " CEDM::FEVD Added pvar {0}, returned\n{1}", pvar_sp->GetName(), ClangUtil::DumpDecl(var_decl)); } void ClangExpressionDeclMap::AddOneGenericVariable(NameSearchContext &context, - const Symbol &symbol, - unsigned int current_id) { + const Symbol &symbol) { assert(m_parser_vars.get()); Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); @@ -1695,13 +1676,12 @@ parser_vars->m_llvm_value = nullptr; parser_vars->m_lldb_sym = &symbol; - LLDB_LOG(log, " CEDM::FEVD[{0}] Found variable {1}, returned\n{2}", - current_id, decl_name, ClangUtil::DumpDecl(var_decl)); + LLDB_LOG(log, " CEDM::FEVD Found variable {0}, returned\n{1}", decl_name, + ClangUtil::DumpDecl(var_decl)); } void ClangExpressionDeclMap::AddOneRegister(NameSearchContext &context, - const RegisterInfo *reg_info, - unsigned int current_id) { + const RegisterInfo *reg_info) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); CompilerType clang_type = @@ -1735,14 +1715,13 @@ parser_vars->m_lldb_value.Clear(); entity->m_flags |= ClangExpressionVariable::EVBareRegister; - LLDB_LOG(log, " CEDM::FEVD[{0}] Added register {1}, returned\n{2}", - current_id, context.m_decl_name.getAsString(), - ClangUtil::DumpDecl(var_decl)); + LLDB_LOG(log, " CEDM::FEVD Added register {0}, returned\n{1}", + context.m_decl_name.getAsString(), ClangUtil::DumpDecl(var_decl)); } void ClangExpressionDeclMap::AddOneFunction(NameSearchContext &context, - Function *function, Symbol *symbol, - unsigned int current_id) { + Function *function, + Symbol *symbol) { assert(m_parser_vars.get()); Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); @@ -1785,9
@@ function->DumpSymbolContext(&ss); LLDB_LOG(log, - " CEDM::FEVD[{0}] Imported decl for function template" - " {1} (description {2}), returned\n{3}", - current_id, copied_function_template->getNameAsString(), + " CEDM::FEVD Imported decl for function template" + " {0} (description {1}), returned\n{2}", + copied_function_template->getNameAsString(), ss.GetData(), ClangUtil::DumpDecl(copied_function_template)); } @@ -1804,10 +1783,10 @@ function->DumpSymbolContext(&ss); LLDB_LOG(log, - " CEDM::FEVD[{0}]] Imported decl for function {1} " - "(description {2}), returned\n{3}", - current_id, copied_function_decl->getNameAsString(), - ss.GetData(), ClangUtil::DumpDecl(copied_function_decl)); + " CEDM::FEVD Imported decl for function {0} " + "(description {1}), returned\n{2}", + copied_function_decl->getNameAsString(), ss.GetData(), + ClangUtil::DumpDecl(copied_function_decl)); } context.AddNamedDecl(copied_function_decl); @@ -1916,16 +1895,15 @@ Address::DumpStyleResolvedDescription); LLDB_LOG(log, - " CEDM::FEVD[{0}] Found {1} function {2} (description {3}), " - "returned\n{4}", - current_id, (function ? "specific" : "generic"), decl_name, - ss.GetData(), ClangUtil::DumpDecl(function_decl)); + " CEDM::FEVD Found {0} function {1} (description {2}), " + "returned\n{3}", + (function ? "specific" : "generic"), decl_name, ss.GetData(), + ClangUtil::DumpDecl(function_decl)); } } void ClangExpressionDeclMap::AddThisType(NameSearchContext &context, - const TypeFromUser &ut, - unsigned int current_id) { + const TypeFromUser &ut) { CompilerType copied_clang_type = GuardedCopyType(ut); Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); @@ -1994,8 +1972,7 @@ } void ClangExpressionDeclMap::AddOneType(NameSearchContext &context, - const TypeFromUser &ut, - unsigned int current_id) { + const TypeFromUser &ut) { CompilerType copied_clang_type = GuardedCopyType(ut); if (!copied_clang_type) { diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp @@ -30,17 +30,14 @@ bool FindExternalVisibleDeclsByName(const clang::DeclContext *decl_ctx, clang::DeclarationName name) override { - static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; Log *log(GetLogIfAllCategoriesSet( LIBLLDB_LOG_EXPRESSIONS)); // FIXME - a more appropriate log channel? if (log) { LLDB_LOGF(log, - "AppleObjCExternalASTSource::FindExternalVisibleDeclsByName[%" - "u] on (ASTContext*)%p Looking for %s in (%sDecl*)%p", - current_id, + "AppleObjCExternalASTSource::FindExternalVisibleDeclsByName" + " on (ASTContext*)%p Looking for %s in (%sDecl*)%p", static_cast(&decl_ctx->getParentASTContext()), name.getAsString().c_str(), decl_ctx->getDeclKindName(), static_cast(decl_ctx)); @@ -70,44 +67,37 @@ } void CompleteType(clang::TagDecl *tag_decl) override { - static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; Log *log(GetLogIfAllCategoriesSet( LIBLLDB_LOG_EXPRESSIONS)); // FIXME - a more appropriate log channel?
LLDB_LOGF(log, - "AppleObjCExternalASTSource::CompleteType[%u] on " + "AppleObjCExternalASTSource::CompleteType on " "(ASTContext*)%p Completing (TagDecl*)%p named %s", - current_id, static_cast(&tag_decl->getASTContext()), + static_cast(&tag_decl->getASTContext()), static_cast(tag_decl), tag_decl->getName().str().c_str()); - LLDB_LOG(log, " AOEAS::CT[{0}] Before:\n{1}", current_id, - ClangUtil::DumpDecl(tag_decl)); + LLDB_LOG(log, " AOEAS::CT Before:\n{0}", ClangUtil::DumpDecl(tag_decl)); - LLDB_LOG(log, " AOEAS::CT[{1}] After:{1}", current_id, - ClangUtil::DumpDecl(tag_decl)); + LLDB_LOG(log, " AOEAS::CT After:{0}", ClangUtil::DumpDecl(tag_decl)); return; } void CompleteType(clang::ObjCInterfaceDecl *interface_decl) override { - static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; Log *log(GetLogIfAllCategoriesSet( LIBLLDB_LOG_EXPRESSIONS)); // FIXME - a more appropriate log channel? if (log) { LLDB_LOGF(log, - "AppleObjCExternalASTSource::CompleteType[%u] on " + "AppleObjCExternalASTSource::CompleteType on " "(ASTContext*)%p Completing (ObjCInterfaceDecl*)%p named %s", - current_id, static_cast(&interface_decl->getASTContext()), + static_cast(&interface_decl->getASTContext()), static_cast(interface_decl), interface_decl->getName().str().c_str()); - LLDB_LOGF(log, " AOEAS::CT[%u] Before:", current_id); + LLDB_LOGF(log, " AOEAS::CT Before:"); LLDB_LOG(log, " [CT] {0}", ClangUtil::DumpDecl(interface_decl)); } @@ -537,15 +527,13 @@ uint32_t AppleObjCDeclVendor::FindDecls(ConstString name, bool append, uint32_t max_matches, std::vector &decls) { - static unsigned int invocation_id = 0; - unsigned int current_id = invocation_id++; Log *log(GetLogIfAllCategoriesSet( LIBLLDB_LOG_EXPRESSIONS)); // FIXME - a more appropriate log channel? - LLDB_LOGF(log, "AppleObjCDeclVendor::FindDecls [%u] ('%s', %s, %u, )", - current_id, (const char *)name.AsCString(), - append ? "true" : "false", max_matches); + LLDB_LOGF(log, "AppleObjCDeclVendor::FindDecls ('%s', %s, %u, )", + (const char *)name.AsCString(), append ? "true" : "false", + max_matches); if (!append) decls.clear(); @@ -578,24 +566,21 @@ isa_value = metadata->GetISAPtr(); LLDB_LOG(log, - "AOCTV::FT [%u] Found %s (isa 0x%" PRIx64 - ") in the ASTContext", - current_id, result_iface_type.getAsString(), isa_value); + "AOCTV::FT Found {0} (isa 0x{1:x}) in the ASTContext", + result_iface_type.getAsString(), isa_value); } decls.push_back(m_ast_ctx.GetCompilerDecl(result_iface_decl)); ret++; break; } else { - LLDB_LOGF(log, - "AOCTV::FT [%u] There's something in the ASTContext, but " - "it's not something we know about", - current_id); + LLDB_LOGF(log, "AOCTV::FT There's something in the ASTContext, but " + "it's not something we know about"); break; } } else if (log) { - LLDB_LOGF(log, "AOCTV::FT [%u] Couldn't find %s in the ASTContext", - current_id, name.AsCString()); + LLDB_LOGF(log, "AOCTV::FT Couldn't find %s in the ASTContext", + name.AsCString()); } // It's not. If it exists, we have to put it into our ASTContext.
@@ -603,7 +588,7 @@ ObjCLanguageRuntime::ObjCISA isa = m_runtime.GetISA(name); if (!isa) { - LLDB_LOGF(log, "AOCTV::FT [%u] Couldn't find the isa", current_id); + LLDB_LOGF(log, "AOCTV::FT Couldn't find the isa"); break; } @@ -612,9 +597,9 @@ if (!iface_decl) { LLDB_LOGF(log, - "AOCTV::FT [%u] Couldn't get the Objective-C interface for " + "AOCTV::FT Couldn't get the Objective-C interface for " "isa 0x%" PRIx64, - current_id, (uint64_t)isa); + (uint64_t)isa); break; } @@ -622,7 +607,7 @@ if (log) { clang::QualType new_iface_type = ast_ctx.getObjCInterfaceType(iface_decl); - LLDB_LOG(log, "AOCTV::FT [{0}] Created {1} (isa 0x{2:x})", current_id, + LLDB_LOG(log, "AOCTV::FT Created {0} (isa 0x{1:x})", new_iface_type.getAsString(), (uint64_t)isa); } diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -279,7 +279,7 @@ void InitializePythonHome() { #if LLDB_EMBED_PYTHON_HOME #if PY_MAJOR_VERSION >= 3 - typedef const wchar_t* str_type; + typedef wchar_t* str_type; #else typedef char* str_type; #endif diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h --- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h @@ -12,6 +12,7 @@ #include "Plugins/SymbolFile/DWARF/DWARFIndex.h" #include "Plugins/SymbolFile/DWARF/LogChannelDWARF.h" #include "Plugins/SymbolFile/DWARF/ManualDWARFIndex.h" +#include "Plugins/SymbolFile/DWARF/SymbolFileDWARF.h" #include "lldb/Utility/ConstString.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" @@ -20,7 +21,7 @@ public: static llvm::Expected> Create(Module &module, DWARFDataExtractor debug_names, - DWARFDataExtractor debug_str, DWARFDebugInfo &debug_info); + DWARFDataExtractor debug_str, SymbolFileDWARF &dwarf); void Preload() override { m_fallback.Preload(); } @@ -49,11 +50,11 @@ std::unique_ptr debug_names_up, DWARFDataExtractor debug_names_data, DWARFDataExtractor debug_str_data, - DWARFDebugInfo &debug_info) - : DWARFIndex(module), m_debug_info(debug_info), + SymbolFileDWARF &dwarf) - : DWARFIndex(module), m_debug_info(dwarf.DebugInfo()), m_debug_names_data(debug_names_data), m_debug_str_data(debug_str_data), m_debug_names_up(std::move(debug_names_up)), - m_fallback(module, debug_info, GetUnits(*m_debug_names_up)) {} + m_fallback(module, dwarf, GetUnits(*m_debug_names_up)) {} DWARFDebugInfo &m_debug_info; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp --- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp @@ -19,14 +19,14 @@ llvm::Expected> DebugNamesDWARFIndex::Create(Module &module, DWARFDataExtractor debug_names, DWARFDataExtractor debug_str, - DWARFDebugInfo &debug_info) { + SymbolFileDWARF &dwarf) { auto index_up = std::make_unique(debug_names.GetAsLLVM(), debug_str.GetAsLLVM()); if (llvm::Error E = index_up->extract()) return std::move(E); return std::unique_ptr(new DebugNamesDWARFIndex( - module, std::move(index_up), debug_names, debug_str, debug_info)); + module, std::move(index_up), debug_names, debug_str, dwarf)); } llvm::DenseSet diff --git
a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h @@ -14,13 +14,14 @@ #include "llvm/ADT/DenseSet.h" class DWARFDebugInfo; +class SymbolFileDWARFDwo; namespace lldb_private { class ManualDWARFIndex : public DWARFIndex { public: - ManualDWARFIndex(Module &module, DWARFDebugInfo &debug_info, + ManualDWARFIndex(Module &module, SymbolFileDWARF &dwarf, llvm::DenseSet units_to_avoid = {}) - : DWARFIndex(module), m_debug_info(&debug_info), + : DWARFIndex(module), m_dwarf(&dwarf), m_units_to_avoid(std::move(units_to_avoid)) {} void Preload() override { Index(); } @@ -56,14 +57,15 @@ NameToDIE namespaces; }; void Index(); - void IndexUnit(DWARFUnit &unit, IndexSet &set); + void IndexUnit(DWARFUnit &unit, SymbolFileDWARFDwo *dwp, IndexSet &set); static void IndexUnitImpl(DWARFUnit &unit, const lldb::LanguageType cu_language, IndexSet &set); - /// Non-null value means we haven't built the index yet. - DWARFDebugInfo *m_debug_info; + /// The DWARF file which we are indexing. Set to nullptr after the index is + /// built. + SymbolFileDWARF *m_dwarf; /// Which dwarf units should we skip while building the index. llvm::DenseSet m_units_to_avoid; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp @@ -22,22 +22,38 @@ using namespace lldb; void ManualDWARFIndex::Index() { - if (!m_debug_info) + if (!m_dwarf) return; - DWARFDebugInfo &debug_info = *m_debug_info; - m_debug_info = nullptr; + SymbolFileDWARF &main_dwarf = *m_dwarf; + m_dwarf = nullptr; static Timer::Category func_cat(LLVM_PRETTY_FUNCTION); - Timer scoped_timer(func_cat, "%p", static_cast(&debug_info)); + Timer scoped_timer(func_cat, "%p", static_cast(&main_dwarf)); + + DWARFDebugInfo &main_info = main_dwarf.DebugInfo(); + SymbolFileDWARFDwo *dwp_dwarf = main_dwarf.GetDwpSymbolFile().get(); + DWARFDebugInfo *dwp_info = dwp_dwarf ? &dwp_dwarf->DebugInfo() : nullptr; std::vector units_to_index; - units_to_index.reserve(debug_info.GetNumUnits()); - for (size_t U = 0; U < debug_info.GetNumUnits(); ++U) { - DWARFUnit *unit = debug_info.GetUnitAtIndex(U); + units_to_index.reserve(main_info.GetNumUnits() + + (dwp_info ? dwp_info->GetNumUnits() : 0)); + + // Process all units in the main file, as well as any type units in the dwp + // file. Type units in dwo files are handled when we reach the dwo file in + // IndexUnit. 
+ for (size_t U = 0; U < main_info.GetNumUnits(); ++U) { + DWARFUnit *unit = main_info.GetUnitAtIndex(U); if (unit && m_units_to_avoid.count(unit->GetOffset()) == 0) units_to_index.push_back(unit); } + if (dwp_info && dwp_info->ContainsTypeUnits()) { + for (size_t U = 0; U < dwp_info->GetNumUnits(); ++U) { + if (auto *tu = llvm::dyn_cast(dwp_info->GetUnitAtIndex(U))) + units_to_index.push_back(tu); + } + } + if (units_to_index.empty()) return; @@ -48,7 +64,7 @@ std::vector> clear_cu_dies( units_to_index.size()); auto parser_fn = [&](size_t cu_idx) { - IndexUnit(*units_to_index[cu_idx], sets[cu_idx]); + IndexUnit(*units_to_index[cu_idx], dwp_dwarf, sets[cu_idx]); }; auto extract_fn = [&units_to_index, &clear_cu_dies](size_t cu_idx) { @@ -87,11 +103,8 @@ [&]() { finalize_fn(&IndexSet::namespaces); }); } -void ManualDWARFIndex::IndexUnit(DWARFUnit &unit, IndexSet &set) { - assert( - !unit.IsDWOUnit() && - "DWARFUnit associated with .dwo or .dwp should not be indexed directly"); - +void ManualDWARFIndex::IndexUnit(DWARFUnit &unit, SymbolFileDWARFDwo *dwp, + IndexSet &set) { Log *log = LogChannelDWARF::GetLogIfAll(DWARF_LOG_LOOKUPS); if (log) { @@ -105,9 +118,16 @@ IndexUnitImpl(unit, cu_language, set); if (SymbolFileDWARFDwo *dwo_symbol_file = unit.GetDwoSymbolFile()) { - DWARFDebugInfo &dwo_info = dwo_symbol_file->DebugInfo(); - for (size_t i = 0; i < dwo_info.GetNumUnits(); ++i) - IndexUnitImpl(*dwo_info.GetUnitAtIndex(i), cu_language, set); + // Type units in a dwp file are indexed separately, so we just need to + // process the split unit here. However, if the split unit is in a dwo file, + // then we need to process type units here. + if (dwo_symbol_file == dwp) { + IndexUnitImpl(unit.GetNonSkeletonUnit(), cu_language, set); + } else { + DWARFDebugInfo &dwo_info = dwo_symbol_file->DebugInfo(); + for (size_t i = 0; i < dwo_info.GetNumUnits(); ++i) + IndexUnitImpl(*dwo_info.GetUnitAtIndex(i), cu_language, set); + } } } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -292,6 +292,8 @@ lldb_private::DWARFContext &GetDWARFContext() { return m_context; } + const std::shared_ptr &GetDwpSymbolFile(); + lldb_private::FileSpec GetFile(DWARFUnit &unit, size_t file_idx); static llvm::Expected diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -458,9 +458,9 @@ LoadSectionData(eSectionTypeDWARFDebugNames, debug_names); if (debug_names.GetByteSize() > 0) { llvm::Expected> index_or = - DebugNamesDWARFIndex::Create( - *GetObjectFile()->GetModule(), debug_names, - m_context.getOrLoadStrData(), DebugInfo()); + DebugNamesDWARFIndex::Create(*GetObjectFile()->GetModule(), + debug_names, + m_context.getOrLoadStrData(), *this); if (index_or) { m_index = std::move(*index_or); return; @@ -470,8 +470,8 @@ } } - m_index = std::make_unique(*GetObjectFile()->GetModule(), - DebugInfo()); + m_index = + std::make_unique(*GetObjectFile()->GetModule(), *this); } bool SymbolFileDWARF::SupportedVersion(uint16_t version) { @@ -1555,9 +1555,8 @@ if (!dwo_name) return nullptr; - FindDwpSymbolFile(); - if (m_dwp_symfile) - return m_dwp_symfile; + if (std::shared_ptr dwp_sp = GetDwpSymbolFile()) + return dwp_sp; FileSpec 
dwo_file(dwo_name); FileSystem::Instance().Resolve(dwo_file); @@ -3876,7 +3875,7 @@ return m_debug_map_symfile; } -void SymbolFileDWARF::FindDwpSymbolFile() { +const std::shared_ptr &SymbolFileDWARF::GetDwpSymbolFile() { llvm::call_once(m_dwp_symfile_once_flag, [this]() { ModuleSpec module_spec; module_spec.GetFileSpec() = m_objfile_sp->GetFileSpec(); @@ -3899,6 +3898,7 @@ std::make_shared(*this, dwp_obj_file, 0x3fffffff); } }); + return m_dwp_symfile; } llvm::Expected SymbolFileDWARF::GetTypeSystem(DWARFUnit &unit) { diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py @@ -116,7 +116,9 @@ '%s::allocator >) uchar = "aaaaa"'%(ns,ns,ns), ]) - if is_64_bit: + # The test assumes that std::string is in its cap-size-data layout. + is_alternate_layout = ('arm' in self.getArchitecture()) and self.platformIsDarwin() + if is_64_bit and not is_alternate_layout: self.expect("frame variable garbage1", substrs=['garbage1 = Summary Unavailable']) self.expect("frame variable garbage2", substrs=['garbage2 = Summary Unavailable']) self.expect("frame variable garbage3", substrs=['garbage3 = Summary Unavailable']) diff --git a/lldb/test/API/lang/cpp/operators/main.cpp b/lldb/test/API/lang/cpp/operators/main.cpp --- a/lldb/test/API/lang/cpp/operators/main.cpp +++ b/lldb/test/API/lang/cpp/operators/main.cpp @@ -171,7 +171,7 @@ //% self.expect("expr static_cast(c)", endstr=" 12\n") //% self.expect("expr c.operatorint()", endstr=" 13\n") //% self.expect("expr c.operatornew()", endstr=" 14\n") - //% self.expect("expr (new C)->custom_new", endstr=" true\n") + //% self.expect("expr (new struct C)->custom_new", endstr=" true\n") //% self.expect("expr (new struct C[1])->custom_new", endstr=" true\n") //% self.expect("expr delete c2; side_effect", endstr=" = 1\n") //% self.expect("expr delete[] c3; side_effect", endstr=" = 2\n") diff --git a/lldb/test/Shell/SymbolFile/Breakpad/Inputs/basic-elf.yaml b/lldb/test/Shell/SymbolFile/Breakpad/Inputs/basic-elf.yaml --- a/lldb/test/Shell/SymbolFile/Breakpad/Inputs/basic-elf.yaml +++ b/lldb/test/Shell/SymbolFile/Breakpad/Inputs/basic-elf.yaml @@ -20,8 +20,6 @@ Address: 0x00000000004000B0 AddressAlign: 0x0000000000000010 Size: 0x42 -Symbols: -DynamicSymbols: ProgramHeaders: - Type: PT_LOAD Flags: [ PF_X, PF_R ] diff --git a/lldb/test/Shell/SymbolFile/DWARF/dwp-debug-types.s b/lldb/test/Shell/SymbolFile/DWARF/dwp-debug-types.s --- a/lldb/test/Shell/SymbolFile/DWARF/dwp-debug-types.s +++ b/lldb/test/Shell/SymbolFile/DWARF/dwp-debug-types.s @@ -3,6 +3,7 @@ # RUN: llvm-mc --filetype=obj --triple x86_64-pc-linux %s -o %t --defsym MAIN=0 # RUN: llvm-mc --filetype=obj --triple x86_64-pc-linux %s -o %t.dwp --defsym DWP=0 # RUN: %lldb %t -o "type lookup ENUM0" -o "target variable A" -b | FileCheck %s +# RUN: lldb-test symbols %t | FileCheck %s --check-prefix=SYMBOLS # CHECK-LABEL: type lookup ENUM0 # CHECK-NEXT: enum ENUM0 { @@ -13,6 +14,19 @@ # CHECK: (ENUM0) A = case0 # CHECK: (ENUM1) A = case0 +# Make sure each entity is present in the index only once. 
+# SYMBOLS: Globals and statics:
+# SYMBOLS-NEXT: 3fffffff/INFO/00000023 "A"
+# SYMBOLS-NEXT: 3fffffff/INFO/0000005a "A"
+# SYMBOLS-EMPTY:
+
+# SYMBOLS: Types:
+# SYMBOLS-NEXT: 3fffffff/TYPE/00000018 "ENUM0"
+# SYMBOLS-NEXT: 3fffffff/TYPE/0000002d "int"
+# SYMBOLS-NEXT: 3fffffff/TYPE/00000062 "int"
+# SYMBOLS-NEXT: 3fffffff/TYPE/0000004d "ENUM1"
+# SYMBOLS-EMPTY:
+
 .ifdef MAIN
 .section .debug_abbrev,"",@progbits
 .byte 1 # Abbreviation Code
@@ -203,9 +217,9 @@ .endr .endmacro
-  .section .debug_cu_index,"e",@progbits
+  .section .debug_cu_index,"",@progbits
 index 1, .debug_info.dwo, .Lcu_begin, .Ldebug_info_end
-  .section .debug_tu_index,"e",@progbits
+  .section .debug_tu_index,"",@progbits
 index 2, .debug_types.dwo, .Ltu_begin, .Ltype_info_end
 .endif
diff --git a/lldb/test/Shell/SymbolFile/DWARF/dwp.s b/lldb/test/Shell/SymbolFile/DWARF/dwp.s
--- a/lldb/test/Shell/SymbolFile/DWARF/dwp.s
+++ b/lldb/test/Shell/SymbolFile/DWARF/dwp.s
@@ -105,7 +105,7 @@
 # This deliberately excludes compile unit 4 to test the case of a missing
 # split unit.
 .irpc I,0123
-  .section .debug_str.dwo,"e",@progbits
+  .section .debug_str.dwo,"MSe",@progbits,1
 .Lstr\I:
   .byte 'I', 'N', 'T', '0'+\I, 0
@@ -232,7 +232,7 @@
 .Ldebug_info_end\I:
 .endr
-  .section .debug_cu_index,"e",@progbits
+  .section .debug_cu_index,"",@progbits
 .short 2 # DWARF version number
 .short 0 # Reserved
 .long 4 # Section count
diff --git a/lldb/test/Shell/lit-lldb-init.in b/lldb/test/Shell/lit-lldb-init.in
--- a/lldb/test/Shell/lit-lldb-init.in
+++ b/lldb/test/Shell/lit-lldb-init.in
@@ -3,3 +3,4 @@
 settings set plugin.process.gdb-remote.packet-timeout 60
 settings set interpreter.echo-comment-commands false
 settings set symbols.clang-modules-cache-path "@LLDB_TEST_MODULE_CACHE_LLDB@"
+settings set target.auto-apply-fixits false
diff --git a/llvm/docs/Extensions.rst b/llvm/docs/Extensions.rst
--- a/llvm/docs/Extensions.rst
+++ b/llvm/docs/Extensions.rst
@@ -503,7 +503,7 @@
   sub.w sp, sp, r4
 However, this has the limitation of 32 MiB (±16MiB). In order to accommodate
-larger binaries, LLVM supports the use of ``-mcode-model=large`` to allow a 4GiB
+larger binaries, LLVM supports the use of ``-mcmodel=large`` to allow a 4GiB
 range via a slight deviation. It will generate an indirect jump as follows:
 .. code-block:: gas
@@ -544,7 +544,7 @@
   sub sp, sp, x15, lsl #4
 However, this has the limitation of 256 MiB (±128MiB). In order to accommodate
-larger binaries, LLVM supports the use of ``-mcode-model=large`` to allow a 8GiB
+larger binaries, LLVM supports the use of ``-mcmodel=large`` to allow an 8GiB
 (±4GiB) range via a slight deviation. It will generate an indirect jump as
 follows:
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -14331,6 +14331,136 @@
 %res = call i4 @llvm.udiv.fix.i4(i4 3, i4 4, i32 1)  ; %res = 2 (or 1) (1.5 / 2 = 0.75)
+
+'``llvm.sdiv.fix.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.sdiv.fix.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.sdiv.fix.sat.i16(i16 %a, i16 %b, i32 %scale)
+      declare i32 @llvm.sdiv.fix.sat.i32(i32 %a, i32 %b, i32 %scale)
+      declare i64 @llvm.sdiv.fix.sat.i64(i64 %a, i64 %b, i32 %scale)
+      declare <4 x i32> @llvm.sdiv.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview:
+"""""""""
+
+The '``llvm.sdiv.fix.sat``' family of intrinsic functions perform signed
+fixed point saturating division on 2 arguments of the same scale.
+
+Arguments:
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo signed fixed point division. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs fixed point division on the 2 arguments of a
+specified scale. The result will also be returned in the same scale specified
+in the third argument.
+
+If the result value cannot be precisely represented in the given scale, the
+value is rounded up or down to the closest representable value. The rounding
+direction is unspecified.
+
+The maximum value this operation can clamp to is the largest signed value
+representable by the bit width of the first 2 arguments. The minimum value is
+the smallest signed value representable by this bit width.
+
+It is undefined behavior if the second argument is zero.
+
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 6, i4 2, i32 0)  ; %res = 3 (6 / 2 = 3)
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 6, i4 4, i32 1)  ; %res = 3 (3 / 2 = 1.5)
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 3, i4 -2, i32 1) ; %res = -3 (1.5 / -1 = -1.5)
+
+      ; The result in the following could be rounded up to 1 or down to 0.5
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 3, i4 4, i32 1)  ; %res = 2 (or 1) (1.5 / 2 = 0.75)
+
+      ; Saturation
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 -8, i4 -1, i32 0) ; %res = 7 (-8 / -1 = 8 => 7)
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 4, i4 2, i32 2)   ; %res = 7 (1 / 0.5 = 2 => 1.75)
+      %res = call i4 @llvm.sdiv.fix.sat.i4(i4 -4, i4 1, i32 2)  ; %res = -8 (-1 / 0.25 = -4 => -2)
+
+
+'``llvm.udiv.fix.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.udiv.fix.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.udiv.fix.sat.i16(i16 %a, i16 %b, i32 %scale)
+      declare i32 @llvm.udiv.fix.sat.i32(i32 %a, i32 %b, i32 %scale)
+      declare i64 @llvm.udiv.fix.sat.i64(i64 %a, i64 %b, i32 %scale)
+      declare <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview:
+"""""""""
+
+The '``llvm.udiv.fix.sat``' family of intrinsic functions perform unsigned
+fixed point saturating division on 2 arguments of the same scale.
+
+Arguments:
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo unsigned fixed point division. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs fixed point division on the 2 arguments of a
+specified scale. The result will also be returned in the same scale specified
+in the third argument.
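+
+In other words, the stored result is (%a * 2^%scale) / %b: the first operand
+is effectively scaled up by the common scale before an ordinary integer
+division, subject to the rounding rule below.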
+ +If the result value cannot be precisely represented in the given scale, the +value is rounded up or down to the closest representable value. The rounding +direction is unspecified. + +The maximum value this operation can clamp to is the largest unsigned value +representable by the bit width of the first 2 arguments. The minimum value is the +smallest unsigned value representable by this bit width (zero). + +It is undefined behavior if the second argument is zero. + +Examples +""""""""" + +.. code-block:: llvm + + %res = call i4 @llvm.udiv.fix.sat.i4(i4 6, i4 2, i32 0) ; %res = 3 (6 / 2 = 3) + %res = call i4 @llvm.udiv.fix.sat.i4(i4 6, i4 4, i32 1) ; %res = 3 (3 / 2 = 1.5) + + ; The result in the following could be rounded down to 0.5 or up to 1 + %res = call i4 @llvm.udiv.fix.sat.i4(i4 3, i4 4, i32 1) ; %res = 1 (or 2) (1.5 / 2 = 0.75) + + ; Saturation + %res = call i4 @llvm.udiv.fix.sat.i4(i4 8, i4 2, i32 2) ; %res = 15 (2 / 0.5 = 4 => 3.75) + + Specialised Arithmetic Intrinsics --------------------------------- diff --git a/llvm/docs/LoopTerminology.rst b/llvm/docs/LoopTerminology.rst --- a/llvm/docs/LoopTerminology.rst +++ b/llvm/docs/LoopTerminology.rst @@ -43,6 +43,9 @@ * Any two loops are either fully disjoint (no intersecting blocks), or one must be a sub-loop of the other. +* Loops in a function form a forest. One implication of this fact + is that a loop either has no parent or a single parent. + A loop may have an arbitrary number of exits, both explicit (via control flow) and implicit (via throwing calls which transfer control out of the containing function). There is no special requirement on diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -1007,6 +1007,16 @@ // Extra additions for arrays //===----------------------------------------------------------------------===// +// We have a copy here so that LLVM behaves the same when using different +// standard libraries. +template +void shuffle(Iterator first, Iterator last, RNG &&g) { + // It would be better to use a std::uniform_int_distribution, + // but that would be stdlib dependent. + for (auto size = last - first; size > 1; ++first, (void)--size) + std::iter_swap(first, first + g() % size); +} + /// Find the length of an array. template constexpr inline size_t array_lengthof(T (&)[N]) { diff --git a/llvm/include/llvm/Analysis/CFGPrinter.h b/llvm/include/llvm/Analysis/CFGPrinter.h --- a/llvm/include/llvm/Analysis/CFGPrinter.h +++ b/llvm/include/llvm/Analysis/CFGPrinter.h @@ -53,6 +53,9 @@ template<> struct DOTGraphTraits : public DefaultDOTGraphTraits { + // Cache for is hidden property + llvm::DenseMap isHiddenBasicBlock; + DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} static std::string getGraphName(const Function *F) { @@ -173,6 +176,8 @@ // profile count (due to scaling). return ("label=\"W:" + Twine(Weight->getZExtValue()) + "\"").str(); } + bool isNodeHidden(const BasicBlock *Node); + void computeHiddenNodes(const Function *F); }; } // End llvm namespace diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h --- a/llvm/include/llvm/Analysis/LoopInfo.h +++ b/llvm/include/llvm/Analysis/LoopInfo.h @@ -103,6 +103,14 @@ return D; } BlockT *getHeader() const { return getBlocks().front(); } + /// Return the parent loop if it exists or nullptr for top + /// level loops. 
+ + /// A loop is either top-level in a function (that is, it is not + /// contained in any other loop) or it is entirely enclosed in + /// some other loop. + /// If a loop is top-level, it has no parent, otherwise its + /// parent is the innermost loop in which it is enclosed. LoopT *getParentLoop() const { return ParentLoop; } /// This is a raw interface for bypassing addChildLoop. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h --- a/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h @@ -13,6 +13,7 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_KNOWNBITSINFO_H #define LLVM_CODEGEN_GLOBALISEL_KNOWNBITSINFO_H +#include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/Register.h" @@ -32,6 +33,8 @@ const TargetLowering &TL; const DataLayout &DL; unsigned MaxDepth; + /// Cache maintained during a computeKnownBits request. + SmallDenseMap ComputeKnownBitsCache; public: GISelKnownBits(MachineFunction &MF, unsigned MaxDepth = 6); diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -291,6 +291,11 @@ /// constant integer. SDIVFIX, UDIVFIX, + /// Same as the corresponding unsaturated fixed point instructions, but the + /// result is clamped between the min and max values representable by the + /// bits of the first 2 operands. + SDIVFIXSAT, UDIVFIXSAT, + /// Simple binary floating point operators. FADD, FSUB, FMUL, FDIV, FREM, diff --git a/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h b/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h --- a/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h +++ b/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h @@ -16,6 +16,7 @@ #define LLVM_CODEGEN_SCOREBOARDHAZARDRECOGNIZER_H #include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/MC/MCInstrItineraries.h" #include #include #include @@ -37,7 +38,7 @@ // bottom-up scheduler, then the scoreboard cycles are the inverse of the // scheduler's cycles. class Scoreboard { - unsigned *Data = nullptr; + InstrStage::FuncUnits *Data = nullptr; // The maximum number of cycles monitored by the Scoreboard. This // value is determined based on the target itineraries to ensure @@ -56,7 +57,7 @@ size_t getDepth() const { return Depth; } - unsigned& operator[](size_t idx) const { + InstrStage::FuncUnits& operator[](size_t idx) const { // Depth is expected to be a power-of-2. assert(Depth && !(Depth & (Depth - 1)) && "Scoreboard was not initialized properly!"); @@ -67,7 +68,7 @@ void reset(size_t d = 1) { if (!Data) { Depth = d; - Data = new unsigned[Depth]; + Data = new InstrStage::FuncUnits[Depth]; } memset(Data, 0, Depth * sizeof(Data[0])); diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1209,13 +1209,6 @@ ArrayRef Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType); - /// Return (create a new or find existing) a target-specific node. - /// TargetMemSDNode should be derived class from MemSDNode. - template - SDValue getTargetMemSDNode(SDVTList VTs, ArrayRef Ops, - const SDLoc &dl, EVT MemVT, - MachineMemOperand *MMO); - /// Construct a node to track a Value* through the backend. 
SDValue getSrcValue(const Value *v); @@ -1856,41 +1849,6 @@ } }; -template -SDValue SelectionDAG::getTargetMemSDNode(SDVTList VTs, - ArrayRef Ops, - const SDLoc &dl, EVT MemVT, - MachineMemOperand *MMO) { - /// Compose node ID and try to find an existing node. - FoldingSetNodeID ID; - unsigned Opcode = - TargetMemSDNode(dl.getIROrder(), DebugLoc(), VTs, MemVT, MMO).getOpcode(); - ID.AddInteger(Opcode); - ID.AddPointer(VTs.VTs); - for (auto& Op : Ops) { - ID.AddPointer(Op.getNode()); - ID.AddInteger(Op.getResNo()); - } - ID.AddInteger(MemVT.getRawBits()); - ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); - ID.AddInteger(getSyntheticNodeSubclassData( - dl.getIROrder(), VTs, MemVT, MMO)); - - void *IP = nullptr; - if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { - cast(E)->refineAlignment(MMO); - return SDValue(E, 0); - } - - /// Existing node was not found. Create a new one. - auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, - MemVT, MMO); - createOperands(N, Ops); - CSEMap.InsertNode(N, IP); - InsertNode(N); - return SDValue(N, 0); -} - } // end namespace llvm #endif // LLVM_CODEGEN_SELECTIONDAG_H diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1043,7 +1043,9 @@ case ISD::UMULFIX: case ISD::UMULFIXSAT: case ISD::SDIVFIX: + case ISD::SDIVFIXSAT: case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: Supported = isSupportedFixedPointOperation(Op, VT, Scale); break; } @@ -4269,7 +4271,7 @@ /// method accepts integers as its arguments. SDValue expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const; - /// Method for building the DAG expansion of ISD::[US]DIVFIX. This + /// Method for building the DAG expansion of ISD::[US]DIVFIX[SAT]. This /// method accepts integers as its arguments. /// Note: This method may fail if the division could not be performed /// within the type. Clients must retry with a wider type if this happens. diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -1176,7 +1176,7 @@ /// Pre-prune passes. /// /// These passes are called on the graph after it is built, and before any - /// symbols have been pruned. + /// symbols have been pruned. Graph nodes still have their original vmaddrs. /// /// Notable use cases: Marking symbols live or should-discard. LinkGraphPassList PrePrunePasses; @@ -1184,15 +1184,26 @@ /// Post-prune passes. /// /// These passes are called on the graph after dead stripping, but before - /// fixups are applied. + /// memory is allocated or nodes assigned their final addresses. /// /// Notable use cases: Building GOT, stub, and TLV symbols. LinkGraphPassList PostPrunePasses; + /// Pre-fixup passes. + /// + /// These passes are called on the graph after memory has been allocated, + /// content copied into working memory, and nodes have been assigned their + /// final addresses. + /// + /// Notable use cases: Late link-time optimizations like GOT and stub + /// elimination. + LinkGraphPassList PostAllocationPasses; + /// Post-fixup passes. /// /// These passes are called on the graph after block contents has been copied - /// to working memory, and fixups applied. + /// to working memory, and fixups applied. Graph nodes have been updated to + /// their final target vmaddrs. 
 ///
 /// Notable use cases: Testing and validation.
 LinkGraphPassList PostFixupPasses;
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h
--- a/llvm/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h
@@ -22,6 +22,7 @@
 enum MachOX86RelocationKind : Edge::Kind {
   Branch32 = Edge::FirstRelocation,
+  Branch32ToStub,
   Pointer32,
   Pointer64,
   Pointer64Anon,
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
--- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
@@ -424,6 +424,44 @@
   SymbolNameSet Symbols;
 };
+/// Errors of this type should be returned if a module fails to include
+/// definitions that are claimed by the module's associated
+/// MaterializationResponsibility. If this error is returned it is indicative of
+/// a broken transformation / compiler / object cache.
+class MissingSymbolDefinitions : public ErrorInfo<MissingSymbolDefinitions> {
+public:
+  static char ID;
+
+  MissingSymbolDefinitions(std::string ModuleName, SymbolNameVector Symbols)
+    : ModuleName(std::move(ModuleName)), Symbols(std::move(Symbols)) {}
+  std::error_code convertToErrorCode() const override;
+  void log(raw_ostream &OS) const override;
+  const std::string &getModuleName() const { return ModuleName; }
+  const SymbolNameVector &getSymbols() const { return Symbols; }
+private:
+  std::string ModuleName;
+  SymbolNameVector Symbols;
+};
+
+/// Errors of this type should be returned if a module contains definitions for
+/// symbols that are not claimed by the module's associated
+/// MaterializationResponsibility. If this error is returned it is indicative of
+/// a broken transformation / compiler / object cache.
+class UnexpectedSymbolDefinitions : public ErrorInfo<UnexpectedSymbolDefinitions> {
+public:
+  static char ID;
+
+  UnexpectedSymbolDefinitions(std::string ModuleName, SymbolNameVector Symbols)
+    : ModuleName(std::move(ModuleName)), Symbols(std::move(Symbols)) {}
+  std::error_code convertToErrorCode() const override;
+  void log(raw_ostream &OS) const override;
+  const std::string &getModuleName() const { return ModuleName; }
+  const SymbolNameVector &getSymbols() const { return Symbols; }
+private:
+  std::string ModuleName;
+  SymbolNameVector Symbols;
+};
+
 /// Tracks responsibility for materialization, and mediates interactions between
 /// MaterializationUnits and JDs.
 ///
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h b/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h
--- a/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/OrcError.h
@@ -37,7 +37,9 @@
   UnexpectedRPCCall,
   UnexpectedRPCResponse,
   UnknownErrorCodeFromRemote,
-  UnknownResourceHandle
+  UnknownResourceHandle,
+  MissingSymbolDefinitions,
+  UnexpectedSymbolDefinitions,
 };
 std::error_code orcError(OrcErrorCode ErrCode);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -969,6 +969,14 @@
   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>;
+def int_sdiv_fix_sat : Intrinsic<[llvm_anyint_ty],
+  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+  [IntrNoMem, ImmArg<2>]>;
+
+def int_udiv_fix_sat : Intrinsic<[llvm_anyint_ty],
+  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+  [IntrNoMem, ImmArg<2>]>;
+
//===------------------------- Memory Use Markers -------------------------===//
//
def int_lifetime_start : Intrinsic<[],
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -1674,7 +1674,8 @@
 }
 //===----------------------------------------------------------------------===//
-// Matchers for overflow check patterns: e.g. (a + b) u< a
+// Matchers for overflow check patterns: e.g. (a + b) u< a, (a ^ -1) <u b
@@ -1705,6 +1706,19 @@
   if (AddExpr.match(ICmpRHS) && (ICmpLHS == AddLHS || ICmpLHS == AddRHS))
     return L.match(AddLHS) && R.match(AddRHS) && S.match(ICmpRHS);
+  Value *Op1;
+  auto XorExpr = m_OneUse(m_Xor(m_Value(Op1), m_AllOnes()));
+  // (a ^ -1) <u b
+  if (Pred == ICmpInst::ICMP_ULT) {
+    if (XorExpr.match(ICmpLHS))
+      return L.match(Op1) && R.match(ICmpRHS) && S.match(ICmpLHS);
+  }
+  //  b u> (a ^ -1)
+  if (Pred == ICmpInst::ICMP_UGT) {
+    if (XorExpr.match(ICmpRHS))
+      return L.match(Op1) && R.match(ICmpLHS) && S.match(ICmpRHS);
+  }
+
   // Match special-case for increment-by-1.
   if (Pred == ICmpInst::ICMP_EQ) {
     // (a + 1) == 0
diff --git a/llvm/include/llvm/MC/MCInstrItineraries.h b/llvm/include/llvm/MC/MCInstrItineraries.h
--- a/llvm/include/llvm/MC/MCInstrItineraries.h
+++ b/llvm/include/llvm/MC/MCInstrItineraries.h
@@ -61,8 +61,11 @@
     Reserved = 1
   };
+  /// Bitmask representing a set of functional units.
+  typedef uint64_t FuncUnits;
+
   unsigned Cycles_;  ///< Length of stage in machine cycles
-  unsigned Units_;   ///< Choice of functional units
+  FuncUnits Units_;  ///< Choice of functional units
   int NextCycles_;   ///< Number of machine cycles to next stage
   ReservationKinds Kind_; ///< Kind of the FU reservation
@@ -72,7 +75,7 @@
   }
   /// Returns the choice of FUs.
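+  /// (FuncUnits is 64 bits wide, so an itinerary can now describe up to 64
+  /// functional units; the old unsigned mask topped out at 32.)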
- unsigned getUnits() const { + FuncUnits getUnits() const { return Units_; } diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h --- a/llvm/include/llvm/ObjectYAML/ELFYAML.h +++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h @@ -167,7 +167,7 @@ struct Section : public Chunk { ELF_SHT Type; Optional Flags; - llvm::yaml::Hex64 Address; + Optional Address; StringRef Link; llvm::yaml::Hex64 AddressAlign; Optional EntSize; diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -402,7 +402,9 @@ def umulfix : SDNode<"ISD::UMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>; def umulfixsat : SDNode<"ISD::UMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>; def sdivfix : SDNode<"ISD::SDIVFIX" , SDTIntScaledBinOp>; +def sdivfixsat : SDNode<"ISD::SDIVFIXSAT", SDTIntScaledBinOp>; def udivfix : SDNode<"ISD::UDIVFIX" , SDTIntScaledBinOp>; +def udivfixsat : SDNode<"ISD::UDIVFIXSAT", SDTIntScaledBinOp>; def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>; def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>; diff --git a/llvm/lib/Analysis/CFGPrinter.cpp b/llvm/lib/Analysis/CFGPrinter.cpp --- a/llvm/lib/Analysis/CFGPrinter.cpp +++ b/llvm/lib/Analysis/CFGPrinter.cpp @@ -17,11 +17,14 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/CFGPrinter.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" +#include + using namespace llvm; static cl::opt CFGFuncName( @@ -33,6 +36,12 @@ "cfg-dot-filename-prefix", cl::Hidden, cl::desc("The prefix used for the CFG dot file names.")); +static cl::opt HideUnreachablePaths("cfg-hide-unreachable-paths", + cl::init(false)); + +static cl::opt HideDeoptimizePaths("cfg-hide-deoptimize-paths", + cl::init(false)); + namespace { struct CFGViewerLegacyPass : public FunctionPass { static char ID; // Pass identifcation, replacement for typeid @@ -200,3 +209,30 @@ return new CFGOnlyPrinterLegacyPass(); } +void DOTGraphTraits::computeHiddenNodes(const Function *F) { + auto evaluateBB = [&](const BasicBlock *Node) { + if (succ_begin(Node) == succ_end(Node)) { + const Instruction *TI = Node->getTerminator(); + isHiddenBasicBlock[Node] = + (HideUnreachablePaths && isa(TI)) || + (HideDeoptimizePaths && Node->getTerminatingDeoptimizeCall()); + return; + } + isHiddenBasicBlock[Node] = std::all_of( + succ_begin(Node), succ_end(Node), + [this](const BasicBlock *BB) { return isHiddenBasicBlock[BB]; }); + }; + /// The post order traversal iteration is done to know the status of + /// isHiddenBasicBlock for all the successors on the current BB. + for_each(po_begin(&F->getEntryBlock()), po_end(&F->getEntryBlock()), + evaluateBB); +} + +bool DOTGraphTraits::isNodeHidden(const BasicBlock *Node) { + // If both restricting flags are false, all nodes are displayed. 
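+  // Otherwise, lazily compute and cache the hidden set for this function the
+  // first time any of its nodes is queried (see computeHiddenNodes above).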
+ if (!HideUnreachablePaths && !HideDeoptimizePaths) + return false; + if (isHiddenBasicBlock.find(Node) == isHiddenBasicBlock.end()) + computeHiddenNodes(Node->getParent()); + return isHiddenBasicBlock[Node]; +} diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -399,7 +399,8 @@ bool simplifyOffsetableRelocate(Instruction &I); bool tryToSinkFreeOperands(Instruction *I); - bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp, + bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, + Value *Arg1, CmpInst *Cmp, Intrinsic::ID IID); bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT); bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT); @@ -1185,6 +1186,7 @@ } bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO, + Value *Arg0, Value *Arg1, CmpInst *Cmp, Intrinsic::ID IID) { if (BO->getParent() != Cmp->getParent()) { @@ -1202,8 +1204,6 @@ } // We allow matching the canonical IR (add X, C) back to (usubo X, -C). - Value *Arg0 = BO->getOperand(0); - Value *Arg1 = BO->getOperand(1); if (BO->getOpcode() == Instruction::Add && IID == Intrinsic::usub_with_overflow) { assert(isa(Arg1) && "Unexpected input for usubo"); @@ -1213,7 +1213,9 @@ // Insert at the first instruction of the pair. Instruction *InsertPt = nullptr; for (Instruction &Iter : *Cmp->getParent()) { - if (&Iter == BO || &Iter == Cmp) { + // If BO is an XOR, it is not guaranteed that it comes after both inputs to + // the overflow intrinsic are defined. + if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) { InsertPt = &Iter; break; } @@ -1222,12 +1224,16 @@ IRBuilder<> Builder(InsertPt); Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1); - Value *Math = Builder.CreateExtractValue(MathOV, 0, "math"); + if (BO->getOpcode() != Instruction::Xor) { + Value *Math = Builder.CreateExtractValue(MathOV, 0, "math"); + BO->replaceAllUsesWith(Math); + } else + assert(BO->hasOneUse() && + "Patterns with XOr should use the BO only in the compare"); Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov"); - BO->replaceAllUsesWith(Math); Cmp->replaceAllUsesWith(OV); - BO->eraseFromParent(); Cmp->eraseFromParent(); + BO->eraseFromParent(); return true; } @@ -1267,9 +1273,13 @@ bool &ModifiedDT) { Value *A, *B; BinaryOperator *Add; - if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) + if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) { if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add)) return false; + // Set A and B in case we match matchUAddWithOverflowConstantEdgeCases. + A = Add->getOperand(0); + B = Add->getOperand(1); + } if (!TLI->shouldFormOverflowOp(ISD::UADDO, TLI->getValueType(*DL, Add->getType()), @@ -1282,7 +1292,8 @@ if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse()) return false; - if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow)) + if (!replaceMathCmpWithIntrinsic(Add, A, B, Cmp, + Intrinsic::uadd_with_overflow)) return false; // Reset callers - do not crash by iterating over a dead instruction. @@ -1344,7 +1355,8 @@ Sub->hasNUsesOrMore(2))) return false; - if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow)) + if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1), + Cmp, Intrinsic::usub_with_overflow)) return false; // Reset callers - do not crash by iterating over a dead instruction. 
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp --- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp @@ -69,7 +69,10 @@ LLT Ty = MRI.getType(R); APInt DemandedElts = Ty.isVector() ? APInt::getAllOnesValue(Ty.getNumElements()) : APInt(1, 1); + // For now, we only maintain the cache during one request. + assert(ComputeKnownBitsCache.empty() && "Cache should have been cleared"); computeKnownBitsImpl(R, Known, DemandedElts); + ComputeKnownBitsCache.clear(); return Known; } @@ -85,6 +88,17 @@ APInt GISelKnownBits::getKnownOnes(Register R) { return getKnownBits(R).One; } +LLVM_ATTRIBUTE_UNUSED static void +dumpResult(const MachineInstr &MI, const KnownBits &Known, unsigned Depth) { + dbgs() << "[" << Depth << "] Compute known bits: " << MI << "[" << Depth + << "] Computed for: " << MI << "[" << Depth << "] Known: 0x" + << (Known.Zero | Known.One).toString(16, false) << "\n" + << "[" << Depth << "] Zero: 0x" << Known.Zero.toString(16, false) + << "\n" + << "[" << Depth << "] One: 0x" << Known.One.toString(16, false) + << "\n"; +} + void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth) { @@ -102,6 +116,14 @@ } unsigned BitWidth = DstTy.getSizeInBits(); + auto CacheEntry = ComputeKnownBitsCache.find(R); + if (CacheEntry != ComputeKnownBitsCache.end()) { + Known = CacheEntry->second; + LLVM_DEBUG(dbgs() << "Cache hit at "); + LLVM_DEBUG(dumpResult(MI, Known, Depth)); + assert(Known.getBitWidth() == BitWidth && "Cache entry size doesn't match"); + return; + } Known = KnownBits(BitWidth); // Don't know anything if (DstTy.isVector()) @@ -137,6 +159,14 @@ // point of the pipeline, otherwise the main live-range will be // defined more than once, which is against SSA. assert(MI.getOperand(0).getSubReg() == 0 && "Is this code in SSA?"); + // Record in the cache that we know nothing for MI. + // This will get updated later and in the meantime, if we reach that + // phi again, because of a loop, we will cut the search thanks to this + // cache entry. When this happens this cache entry is actually accurate, + // thus we are not losing anything by doing that, because right now, + // the main analysis will reach the maximum depth without being able + // to fully analyze the phi. + ComputeKnownBitsCache[R] = KnownBits(BitWidth); // PHI's operand are a mix of registers and basic blocks interleaved. // We only care about the register ones. for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) { @@ -374,14 +404,10 @@ } assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - LLVM_DEBUG(dbgs() << "[" << Depth << "] Compute known bits: " << MI << "[" - << Depth << "] Computed for: " << MI << "[" << Depth - << "] Known: 0x" - << (Known.Zero | Known.One).toString(16, false) << "\n" - << "[" << Depth << "] Zero: 0x" - << Known.Zero.toString(16, false) << "\n" - << "[" << Depth << "] One: 0x" - << Known.One.toString(16, false) << "\n"); + LLVM_DEBUG(dumpResult(MI, Known, Depth)); + + // Update the cache. 
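+  // The entry only lives for the duration of the current getKnownBits query;
+  // the top-level caller clears the whole map once the request completes.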
+ ComputeKnownBitsCache[R] = Known; } unsigned GISelKnownBits::computeNumSignBits(Register R, diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -909,7 +909,7 @@ struct FuncUnitSorter { const InstrItineraryData *InstrItins; const MCSubtargetInfo *STI; - DenseMap Resources; + DenseMap Resources; FuncUnitSorter(const TargetSubtargetInfo &TSI) : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {} @@ -917,14 +917,15 @@ // Compute the number of functional unit alternatives needed // at each stage, and take the minimum value. We prioritize the // instructions by the least number of choices first. - unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const { + unsigned minFuncUnits(const MachineInstr *Inst, + InstrStage::FuncUnits &F) const { unsigned SchedClass = Inst->getDesc().getSchedClass(); unsigned min = UINT_MAX; if (InstrItins && !InstrItins->isEmpty()) { for (const InstrStage &IS : make_range(InstrItins->beginStage(SchedClass), InstrItins->endStage(SchedClass))) { - unsigned funcUnits = IS.getUnits(); + InstrStage::FuncUnits funcUnits = IS.getUnits(); unsigned numAlternatives = countPopulation(funcUnits); if (numAlternatives < min) { min = numAlternatives; @@ -970,7 +971,7 @@ for (const InstrStage &IS : make_range(InstrItins->beginStage(SchedClass), InstrItins->endStage(SchedClass))) { - unsigned FuncUnits = IS.getUnits(); + InstrStage::FuncUnits FuncUnits = IS.getUnits(); if (countPopulation(FuncUnits) == 1) Resources[FuncUnits]++; } @@ -998,7 +999,7 @@ /// Return true if IS1 has less priority than IS2. bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const { - unsigned F1 = 0, F2 = 0; + InstrStage::FuncUnits F1 = 0, F2 = 0; unsigned MFUs1 = minFuncUnits(IS1, F1); unsigned MFUs2 = minFuncUnits(IS2, F2); if (MFUs1 == MFUs2) diff --git a/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp --- a/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp +++ b/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp @@ -92,10 +92,11 @@ last--; for (unsigned i = 0; i <= last; i++) { - unsigned FUs = (*this)[i]; + InstrStage::FuncUnits FUs = (*this)[i]; dbgs() << "\t"; - for (int j = 31; j >= 0; j--) - dbgs() << ((FUs & (1 << j)) ? '1' : '0'); + for (int j = std::numeric_limits::digits - 1; + j >= 0; j--) + dbgs() << ((FUs & (1ULL << j)) ? 
'1' : '0');
     dbgs() << '\n';
   }
 }
@@ -142,7 +143,7 @@
       break;
     }
-    unsigned freeUnits = IS->getUnits();
+    InstrStage::FuncUnits freeUnits = IS->getUnits();
     switch (IS->getReservationKind()) {
     case InstrStage::Required:
       // Required FUs conflict with both reserved and required ones
@@ -193,7 +194,7 @@
     assert(((cycle + i) < RequiredScoreboard.getDepth()) &&
            "Scoreboard depth exceeded!");
-    unsigned freeUnits = IS->getUnits();
+    InstrStage::FuncUnits freeUnits = IS->getUnits();
     switch (IS->getReservationKind()) {
     case InstrStage::Required:
       // Required FUs conflict with both reserved and required ones
@@ -206,7 +207,7 @@
     }
     // reduce to a single unit
-    unsigned freeUnit = 0;
+    InstrStage::FuncUnits freeUnit = 0;
     do {
       freeUnit = freeUnits;
       freeUnits = freeUnit & (freeUnit - 1);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1132,7 +1132,9 @@
   case ISD::UMULFIX:
   case ISD::UMULFIXSAT:
   case ISD::SDIVFIX:
-  case ISD::UDIVFIX: {
+  case ISD::SDIVFIXSAT:
+  case ISD::UDIVFIX:
+  case ISD::UDIVFIXSAT: {
     unsigned Scale = Node->getConstantOperandVal(2);
     Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
                                               Node->getValueType(0), Scale);
@@ -3489,7 +3491,9 @@
     Results.push_back(TLI.expandFixedPointMul(Node, DAG));
     break;
   case ISD::SDIVFIX:
+  case ISD::SDIVFIXSAT:
   case ISD::UDIVFIX:
+  case ISD::UDIVFIXSAT:
     if (SDValue V = TLI.expandFixedPointDiv(Node->getOpcode(), SDLoc(Node),
                                             Node->getOperand(0),
                                             Node->getOperand(1),
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -162,7 +162,9 @@
   case ISD::UMULFIXSAT: Res = PromoteIntRes_MULFIX(N); break;
   case ISD::SDIVFIX:
-  case ISD::UDIVFIX: Res = PromoteIntRes_DIVFIX(N); break;
+  case ISD::SDIVFIXSAT:
+  case ISD::UDIVFIX:
+  case ISD::UDIVFIXSAT: Res = PromoteIntRes_DIVFIX(N); break;
   case ISD::ABS: Res = PromoteIntRes_ABS(N); break;
@@ -784,22 +786,51 @@
                      N->getOperand(2));
 }
+static SDValue SaturateWidenedDIVFIX(SDValue V, SDLoc &dl,
+                                     unsigned SatW, bool Signed,
+                                     const TargetLowering &TLI,
+                                     SelectionDAG &DAG) {
+  EVT VT = V.getValueType();
+  unsigned VTW = VT.getScalarSizeInBits();
+
+  if (!Signed) {
+    // Saturate to the unsigned maximum by getting the minimum of V and the
+    // maximum.
+    return DAG.getNode(ISD::UMIN, dl, VT, V,
+                       DAG.getConstant(APInt::getLowBitsSet(VTW, SatW),
+                                       dl, VT));
+  }
+
+  // Saturate to the signed maximum (the low SatW - 1 bits) by taking the
+  // signed minimum of it and V.
+  V = DAG.getNode(ISD::SMIN, dl, VT, V,
+                  DAG.getConstant(APInt::getLowBitsSet(VTW, SatW - 1),
+                                  dl, VT));
+  // Saturate to the signed minimum (the high VTW - SatW + 1 bits) by taking
+  // the signed maximum of it and V.
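+  // (getHighBitsSet(VTW, VTW - SatW + 1) sets bits [SatW-1, VTW-1], i.e. the
+  // SatW-bit signed minimum sign-extended to VTW bits.)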
+ V = DAG.getNode(ISD::SMAX, dl, VT, V, + DAG.getConstant(APInt::getHighBitsSet(VTW, VTW - SatW + 1), + dl, VT)); + return V; +} + static SDValue earlyExpandDIVFIX(SDNode *N, SDValue LHS, SDValue RHS, - unsigned Scale, const TargetLowering &TLI, - SelectionDAG &DAG) { + unsigned Scale, const TargetLowering &TLI, + SelectionDAG &DAG, unsigned SatW = 0) { EVT VT = LHS.getValueType(); - bool Signed = N->getOpcode() == ISD::SDIVFIX; + unsigned VTSize = VT.getScalarSizeInBits(); + bool Signed = N->getOpcode() == ISD::SDIVFIX || + N->getOpcode() == ISD::SDIVFIXSAT; + bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT || + N->getOpcode() == ISD::UDIVFIXSAT; SDLoc dl(N); - // See if we can perform the division in this type without widening. - if (SDValue V = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale, - DAG)) - return V; - - // If that didn't work, double the type width and try again. That must work, - // or something is wrong. - EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), - VT.getScalarSizeInBits() * 2); + // Widen the types by a factor of two. This is guaranteed to expand, since it + // will always have enough high bits in the LHS to shift into. + EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VTSize * 2); + if (VT.isVector()) + WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, + VT.getVectorElementCount()); if (Signed) { LHS = DAG.getSExtOrTrunc(LHS, dl, WideVT); RHS = DAG.getSExtOrTrunc(RHS, dl, WideVT); @@ -808,18 +839,28 @@ RHS = DAG.getZExtOrTrunc(RHS, dl, WideVT); } - // TODO: Saturation. - SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale, DAG); assert(Res && "Expanding DIVFIX with wide type failed?"); + if (Saturating) { + // If the caller has told us to saturate at something less, use that width + // instead of the type before doubling. However, it cannot be more than + // what we just widened! + assert(SatW <= VTSize && + "Tried to saturate to more than the original type?"); + Res = SaturateWidenedDIVFIX(Res, dl, SatW == 0 ? VTSize : SatW, Signed, + TLI, DAG); + } return DAG.getZExtOrTrunc(Res, dl, VT); } SDValue DAGTypeLegalizer::PromoteIntRes_DIVFIX(SDNode *N) { SDLoc dl(N); SDValue Op1Promoted, Op2Promoted; - bool Signed = N->getOpcode() == ISD::SDIVFIX; + bool Signed = N->getOpcode() == ISD::SDIVFIX || + N->getOpcode() == ISD::SDIVFIXSAT; + bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT || + N->getOpcode() == ISD::UDIVFIXSAT; if (Signed) { Op1Promoted = SExtPromotedInteger(N->getOperand(0)); Op2Promoted = SExtPromotedInteger(N->getOperand(1)); @@ -830,23 +871,41 @@ EVT PromotedType = Op1Promoted.getValueType(); unsigned Scale = N->getConstantOperandVal(2); - SDValue Res; // If the type is already legal and the operation is legal in that type, we // should not early expand. 
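+  // In that case the node is emitted at the promoted width. For the
+  // saturating forms the LHS is first shifted up by the width difference so
+  // that the promoted operation saturates at what becomes the original
+  // type's limits once the result is shifted back down.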
if (TLI.isTypeLegal(PromotedType)) { TargetLowering::LegalizeAction Action = TLI.getFixedPointOperationAction(N->getOpcode(), PromotedType, Scale); - if (Action == TargetLowering::Legal || Action == TargetLowering::Custom) - Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, - Op2Promoted, N->getOperand(2)); + if (Action == TargetLowering::Legal || Action == TargetLowering::Custom) { + EVT ShiftTy = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout()); + unsigned Diff = PromotedType.getScalarSizeInBits() - + N->getValueType(0).getScalarSizeInBits(); + if (Saturating) + Op1Promoted = DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, + DAG.getConstant(Diff, dl, ShiftTy)); + SDValue Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, + Op2Promoted, N->getOperand(2)); + if (Saturating) + Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, dl, PromotedType, Res, + DAG.getConstant(Diff, dl, ShiftTy)); + return Res; + } } - if (!Res) - Res = earlyExpandDIVFIX(N, Op1Promoted, Op2Promoted, Scale, TLI, DAG); - - // TODO: Saturation. - - return Res; + // See if we can perform the division in this type without expanding. + if (SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, Op1Promoted, + Op2Promoted, Scale, DAG)) { + if (Saturating) + Res = SaturateWidenedDIVFIX(Res, dl, + N->getValueType(0).getScalarSizeInBits(), + Signed, TLI, DAG); + return Res; + } + // If we cannot, expand it to twice the type width. If we are saturating, give + // it the original width as a saturating width so we don't need to emit + // two saturations. + return earlyExpandDIVFIX(N, Op1Promoted, Op2Promoted, Scale, TLI, DAG, + N->getValueType(0).getScalarSizeInBits()); } SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) { @@ -1315,7 +1374,9 @@ case ISD::UMULFIX: case ISD::UMULFIXSAT: case ISD::SDIVFIX: - case ISD::UDIVFIX: Res = PromoteIntOp_FIX(N); break; + case ISD::SDIVFIXSAT: + case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: Res = PromoteIntOp_FIX(N); break; case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break; @@ -1923,7 +1984,9 @@ case ISD::UMULFIXSAT: ExpandIntRes_MULFIX(N, Lo, Hi); break; case ISD::SDIVFIX: - case ISD::UDIVFIX: ExpandIntRes_DIVFIX(N, Lo, Hi); break; + case ISD::SDIVFIXSAT: + case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: ExpandIntRes_DIVFIX(N, Lo, Hi); break; case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_MUL: @@ -3253,8 +3316,15 @@ void DAGTypeLegalizer::ExpandIntRes_DIVFIX(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Res = earlyExpandDIVFIX(N, N->getOperand(0), N->getOperand(1), - N->getConstantOperandVal(2), TLI, DAG); + SDLoc dl(N); + // Try expanding in the existing type first. 
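+  // (expandFixedPointDiv returns a null SDValue when there is not enough
+  // headroom in this type to perform the division; in that case fall back to
+  // the widening expansion below.)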
+ SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, N->getOperand(0), + N->getOperand(1), + N->getConstantOperandVal(2), DAG); + + if (!Res) + Res = earlyExpandDIVFIX(N, N->getOperand(0), N->getOperand(1), + N->getConstantOperandVal(2), TLI, DAG); SplitInteger(Res, Lo, Hi); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -142,7 +142,7 @@ void ExpandUADDSUBO(SDNode *Node, SmallVectorImpl &Results); void ExpandSADDSUBO(SDNode *Node, SmallVectorImpl &Results); void ExpandMULO(SDNode *Node, SmallVectorImpl &Results); - SDValue ExpandFixedPointDiv(SDNode *Node); + void ExpandFixedPointDiv(SDNode *Node, SmallVectorImpl &Results); SDValue ExpandStrictFPOp(SDNode *Node); void ExpandStrictFPOp(SDNode *Node, SmallVectorImpl &Results); @@ -463,7 +463,9 @@ case ISD::UMULFIX: case ISD::UMULFIXSAT: case ISD::SDIVFIX: - case ISD::UDIVFIX: { + case ISD::SDIVFIXSAT: + case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: { unsigned Scale = Node->getConstantOperandVal(2); Action = TLI.getFixedPointOperationAction(Node->getOpcode(), Node->getValueType(0), Scale); @@ -968,8 +970,11 @@ break; case ISD::SDIVFIX: case ISD::UDIVFIX: - Results.push_back(ExpandFixedPointDiv(Node)); + ExpandFixedPointDiv(Node, Results); return; + case ISD::SDIVFIXSAT: + case ISD::UDIVFIXSAT: + break; #define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" @@ -1454,12 +1459,12 @@ Results.push_back(Overflow); } -SDValue VectorLegalizer::ExpandFixedPointDiv(SDNode *Node) { +void VectorLegalizer::ExpandFixedPointDiv(SDNode *Node, + SmallVectorImpl &Results) { SDNode *N = Node; if (SDValue Expanded = TLI.expandFixedPointDiv(N->getOpcode(), SDLoc(N), N->getOperand(0), N->getOperand(1), N->getConstantOperandVal(2), DAG)) - return Expanded; - return DAG.UnrollVectorOp(N); + Results.push_back(Expanded); } void VectorLegalizer::ExpandStrictFPOp(SDNode *Node, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -166,7 +166,9 @@ case ISD::UMULFIX: case ISD::UMULFIXSAT: case ISD::SDIVFIX: + case ISD::SDIVFIXSAT: case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: R = ScalarizeVecRes_FIX(N); break; } @@ -956,7 +958,9 @@ case ISD::UMULFIX: case ISD::UMULFIXSAT: case ISD::SDIVFIX: + case ISD::SDIVFIXSAT: case ISD::UDIVFIX: + case ISD::UDIVFIXSAT: SplitVecRes_FIX(N, Lo, Hi); break; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5112,8 +5112,13 @@ } switch (Opcode) { - case ISD::FADD: case ISD::FSUB: + // -0.0 - undef --> undef (consistent with "fneg undef") + if (N1CFP && N1CFP->getValueAPF().isNegZero() && N2.isUndef()) + return getUNDEF(VT); + LLVM_FALLTHROUGH; + + case ISD::FADD: case ISD::FMUL: case ISD::FDIV: case ISD::FREM: @@ -6696,8 +6701,6 @@ assert((Opcode == ISD::INTRINSIC_VOID || Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::PREFETCH || - Opcode == ISD::LIFETIME_START || - Opcode == ISD::LIFETIME_END || ((int)Opcode <= std::numeric_limits::max() && (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) && "Opcode is not a 
memory-accessing opcode!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5451,7 +5451,8 @@ SDValue LHS, SDValue RHS, SDValue Scale, SelectionDAG &DAG, const TargetLowering &TLI) { EVT VT = LHS.getValueType(); - bool Signed = Opcode == ISD::SDIVFIX; + bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT; + bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT; LLVMContext &Ctx = *DAG.getContext(); // If the type is legal but the operation isn't, this node might survive all @@ -5463,14 +5464,16 @@ // by bumping the size by one bit. This will force it to Promote, enabling the // early expansion and avoiding the need to expand later. - // We don't have to do this if Scale is 0; that can always be expanded. + // We don't have to do this if Scale is 0; that can always be expanded, unless + // it's a saturating signed operation. Those can experience true integer + // division overflow, a case which we must avoid. // FIXME: We wouldn't have to do this (or any of the early // expansion/promotion) if it was possible to expand a libcall of an // illegal type during operation legalization. But it's not, so things // get a bit hacky. unsigned ScaleInt = cast(Scale)->getZExtValue(); - if (ScaleInt > 0 && + if ((ScaleInt > 0 || (Saturating && Signed)) && (TLI.isTypeLegal(VT) || (VT.isVector() && TLI.isTypeLegal(VT.getVectorElementType())))) { TargetLowering::LegalizeAction Action = TLI.getFixedPointOperationAction( @@ -5492,8 +5495,16 @@ LHS = DAG.getZExtOrTrunc(LHS, DL, PromVT); RHS = DAG.getZExtOrTrunc(RHS, DL, PromVT); } - // TODO: Saturation. + EVT ShiftTy = TLI.getShiftAmountTy(PromVT, DAG.getDataLayout()); + // For saturating operations, we need to shift up the LHS to get the + // proper saturation width, and then shift down again afterwards. + if (Saturating) + LHS = DAG.getNode(ISD::SHL, DL, PromVT, LHS, + DAG.getConstant(1, DL, ShiftTy)); SDValue Res = DAG.getNode(Opcode, DL, PromVT, LHS, RHS, Scale); + if (Saturating) + Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, PromVT, Res, + DAG.getConstant(1, DL, ShiftTy)); return DAG.getZExtOrTrunc(Res, DL, VT); } } @@ -5757,6 +5768,10 @@ return ISD::SDIVFIX; case Intrinsic::udiv_fix: return ISD::UDIVFIX; + case Intrinsic::sdiv_fix_sat: + return ISD::SDIVFIXSAT; + case Intrinsic::udiv_fix_sat: + return ISD::UDIVFIXSAT; default: llvm_unreachable("Unhandled fixed point intrinsic"); } @@ -6460,7 +6475,9 @@ return; } case Intrinsic::sdiv_fix: - case Intrinsic::udiv_fix: { + case Intrinsic::udiv_fix: + case Intrinsic::sdiv_fix_sat: + case Intrinsic::udiv_fix_sat: { SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -314,7 +314,9 @@ case ISD::UMULFIXSAT: return "umulfixsat"; case ISD::SDIVFIX: return "sdivfix"; + case ISD::SDIVFIXSAT: return "sdivfixsat"; case ISD::UDIVFIX: return "udivfix"; + case ISD::UDIVFIXSAT: return "udivfixsat"; // Conversion operators. 
case ISD::SIGN_EXTEND: return "sign_extend"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7332,12 +7332,13 @@ TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl, SDValue LHS, SDValue RHS, unsigned Scale, SelectionDAG &DAG) const { - assert((Opcode == ISD::SDIVFIX || - Opcode == ISD::UDIVFIX) && + assert((Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT || + Opcode == ISD::UDIVFIX || Opcode == ISD::UDIVFIXSAT) && "Expected a fixed point division opcode"); EVT VT = LHS.getValueType(); - bool Signed = Opcode == ISD::SDIVFIX; + bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT; + bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT; EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); // If there is enough room in the type to upscale the LHS or downscale the @@ -7349,7 +7350,15 @@ : DAG.computeKnownBits(LHS).countMinLeadingZeros(); unsigned RHSTrail = DAG.computeKnownBits(RHS).countMinTrailingZeros(); - if (LHSLead + RHSTrail < Scale) + // For signed saturating operations, we need to be able to detect true integer + // division overflow; that is, when you have MIN / -EPS. However, this + // is undefined behavior and if we emit divisions that could take such + // values it may cause undesired behavior (arithmetic exceptions on x86, for + // example). + // Avoid this by requiring an extra bit so that we never get this case. + // FIXME: This is a bit unfortunate as it means that for an 8-bit 7-scale + // signed saturating division, we need to emit a whopping 32-bit division. + if (LHSLead + RHSTrail < Scale + (unsigned)(Saturating && Signed)) return SDValue(); unsigned LHSShift = std::min(LHSLead, Scale); @@ -7403,8 +7412,6 @@ Quot = DAG.getNode(ISD::UDIV, dl, VT, LHS, RHS); - // TODO: Saturation. - return Quot; } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -660,7 +660,9 @@ setOperationAction(ISD::UMULFIX, VT, Expand); setOperationAction(ISD::UMULFIXSAT, VT, Expand); setOperationAction(ISD::SDIVFIX, VT, Expand); + setOperationAction(ISD::SDIVFIXSAT, VT, Expand); setOperationAction(ISD::UDIVFIX, VT, Expand); + setOperationAction(ISD::UDIVFIXSAT, VT, Expand); // Overflow operations default to expand setOperationAction(ISD::SADDO, VT, Expand); diff --git a/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h b/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h --- a/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h @@ -15,6 +15,8 @@ #include "llvm/ExecutionEngine/JITLink/JITLink.h" +#define DEBUG_TYPE "jitlink" + namespace llvm { namespace jitlink { @@ -27,12 +29,25 @@ // the newly added ones, so just copy the existing blocks out. 
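+    // (Otherwise the GOT entry and stub blocks created by this pass would
+    // themselves be visited as we walk the edge lists.)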
std::vector Blocks(G.blocks().begin(), G.blocks().end()); + LLVM_DEBUG(dbgs() << "Creating GOT entries and stubs:\n"); + for (auto *B : Blocks) for (auto &E : B->edges()) - if (impl().isGOTEdge(E)) + if (impl().isGOTEdge(E)) { + LLVM_DEBUG({ + dbgs() << " Updating GOT edge "; + printEdge(dbgs(), *B, E, ""); + dbgs() << "\n"; + }); impl().fixGOTEdge(E, getGOTEntrySymbol(E.getTarget())); - else if (impl().isExternalBranchEdge(E)) + } else if (impl().isExternalBranchEdge(E)) { + LLVM_DEBUG({ + dbgs() << " Updating external branch edge "; + printEdge(dbgs(), *B, E, ""); + dbgs() << "\n"; + }); impl().fixExternalBranchEdge(E, getStubSymbol(E.getTarget())); + } } protected: @@ -44,11 +59,17 @@ // Build the entry if it doesn't exist. if (GOTEntryI == GOTEntries.end()) { auto &GOTEntry = impl().createGOTEntry(Target); + LLVM_DEBUG({ + dbgs() << " Created GOT entry for " << Target.getName() << ": " + << GOTEntry << "\n"; + }); GOTEntryI = GOTEntries.insert(std::make_pair(Target.getName(), &GOTEntry)).first; } assert(GOTEntryI != GOTEntries.end() && "Could not get GOT entry symbol"); + LLVM_DEBUG( + { dbgs() << " Using GOT entry " << *GOTEntryI->second << "\n"; }); return *GOTEntryI->second; } @@ -59,10 +80,15 @@ if (StubI == Stubs.end()) { auto &StubSymbol = impl().createStub(Target); + LLVM_DEBUG({ + dbgs() << " Created stub for " << Target.getName() << ": " + << StubSymbol << "\n"; + }); StubI = Stubs.insert(std::make_pair(Target.getName(), &StubSymbol)).first; } assert(StubI != Stubs.end() && "Count not get stub symbol"); + LLVM_DEBUG({ dbgs() << " Using stub " << *StubI->second << "\n"; }); return *StubI->second; } diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h @@ -100,14 +100,14 @@ // Copy block contents and apply relocations. // Implemented in JITLinker. - virtual Error - copyAndFixUpBlocks(const SegmentLayoutMap &Layout, - JITLinkMemoryManager::Allocation &Alloc) const = 0; + virtual Error fixUpBlocks(LinkGraph &G) const = 0; SegmentLayoutMap layOutBlocks(); Error allocateSegments(const SegmentLayoutMap &Layout); JITLinkContext::LookupMap getExternalSymbolNames() const; void applyLookupResult(AsyncLookupResult LR); + void copyBlockContentToWorkingMemory(const SegmentLayoutMap &Layout, + JITLinkMemoryManager::Allocation &Alloc); void deallocateAndBailOut(Error Err); void dumpGraph(raw_ostream &OS); @@ -144,88 +144,25 @@ return static_cast(*this); } - Error - copyAndFixUpBlocks(const SegmentLayoutMap &Layout, - JITLinkMemoryManager::Allocation &Alloc) const override { - LLVM_DEBUG(dbgs() << "Copying and fixing up blocks:\n"); - for (auto &KV : Layout) { - auto &Prot = KV.first; - auto &SegLayout = KV.second; - - auto SegMem = Alloc.getWorkingMemory( - static_cast(Prot)); - char *LastBlockEnd = SegMem.data(); - char *BlockDataPtr = LastBlockEnd; - - LLVM_DEBUG({ - dbgs() << " Processing segment " - << static_cast(Prot) << " [ " - << (const void *)SegMem.data() << " .. " - << (const void *)((char *)SegMem.data() + SegMem.size()) - << " ]\n Processing content sections:\n"; - }); - - for (auto *B : SegLayout.ContentBlocks) { - LLVM_DEBUG(dbgs() << " " << *B << ":\n"); - - // Pad to alignment/alignment-offset. 
- BlockDataPtr = alignToBlock(BlockDataPtr, *B); - - LLVM_DEBUG({ - dbgs() << " Bumped block pointer to " - << (const void *)BlockDataPtr << " to meet block alignment " - << B->getAlignment() << " and alignment offset " - << B->getAlignmentOffset() << "\n"; - }); - - // Zero pad up to alignment. - LLVM_DEBUG({ - if (LastBlockEnd != BlockDataPtr) - dbgs() << " Zero padding from " << (const void *)LastBlockEnd - << " to " << (const void *)BlockDataPtr << "\n"; - }); - - while (LastBlockEnd != BlockDataPtr) - *LastBlockEnd++ = 0; - - // Copy initial block content. - LLVM_DEBUG({ - dbgs() << " Copying block " << *B << " content, " - << B->getContent().size() << " bytes, from " - << (const void *)B->getContent().data() << " to " - << (const void *)BlockDataPtr << "\n"; - }); - memcpy(BlockDataPtr, B->getContent().data(), B->getContent().size()); - - // Copy Block data and apply fixups. - LLVM_DEBUG(dbgs() << " Applying fixups.\n"); - for (auto &E : B->edges()) { - - // Skip non-relocation edges. - if (!E.isRelocation()) - continue; - - // Dispatch to LinkerImpl for fixup. - if (auto Err = impl().applyFixup(*B, E, BlockDataPtr)) - return Err; - } - - // Point the block's content to the fixed up buffer. - B->setContent(StringRef(BlockDataPtr, B->getContent().size())); - - // Update block end pointer. - LastBlockEnd = BlockDataPtr + B->getContent().size(); - BlockDataPtr = LastBlockEnd; - } + Error fixUpBlocks(LinkGraph &G) const override { + LLVM_DEBUG(dbgs() << "Fixing up blocks:\n"); + + for (auto *B : G.blocks()) { + LLVM_DEBUG(dbgs() << " " << *B << ":\n"); + + // Copy Block data and apply fixups. + LLVM_DEBUG(dbgs() << " Applying fixups.\n"); + for (auto &E : B->edges()) { - // Zero pad the rest of the segment. - LLVM_DEBUG({ - dbgs() << " Zero padding end of segment from " - << (const void *)LastBlockEnd << " to " - << (const void *)((char *)SegMem.data() + SegMem.size()) << "\n"; - }); - while (LastBlockEnd != SegMem.data() + SegMem.size()) - *LastBlockEnd++ = 0; + // Skip non-relocation edges. + if (!E.isRelocation()) + continue; + + // Dispatch to LinkerImpl for fixup. + auto *BlockData = const_cast(B->getContent().data()); + if (auto Err = impl().applyFixup(*B, E, BlockData)) + return Err; + } } return Error::success(); diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp @@ -24,6 +24,8 @@ void JITLinkerBase::linkPhase1(std::unique_ptr Self) { + LLVM_DEBUG({ dbgs() << "Building jitlink graph for new input...\n"; }); + // Build the link graph. if (auto GraphOrErr = buildGraph(Ctx->getObjectBuffer())) G = std::move(*GraphOrErr); @@ -31,6 +33,10 @@ return Ctx->notifyFailed(GraphOrErr.takeError()); assert(G && "Graph should have been created by buildGraph above"); + LLVM_DEBUG({ + dbgs() << "Starting link phase 1 for graph " << G->getName() << "\n"; + }); + // Prune and optimize the graph. if (auto Err = runPasses(Passes.PrePrunePasses)) return Ctx->notifyFailed(std::move(Err)); @@ -59,10 +65,17 @@ return Ctx->notifyFailed(std::move(Err)); // Notify client that the defined symbols have been assigned addresses. 
+ LLVM_DEBUG( + { dbgs() << "Resolving symbols defined in " << G->getName() << "\n"; }); Ctx->notifyResolved(*G); auto ExternalSymbols = getExternalSymbolNames(); + LLVM_DEBUG({ + dbgs() << "Issuing lookup for external symbols for " << G->getName() + << " (may trigger materialization/linking of other graphs)...\n"; + }); + // We're about to hand off ownership of ourself to the continuation. Grab a // pointer to the context so that we can call it to initiate the lookup. // @@ -87,6 +100,11 @@ void JITLinkerBase::linkPhase2(std::unique_ptr Self, Expected LR, SegmentLayoutMap Layout) { + + LLVM_DEBUG({ + dbgs() << "Starting link phase 2 for graph " << G->getName() << "\n"; + }); + // If the lookup failed, bail out. if (!LR) return deallocateAndBailOut(LR.takeError()); @@ -94,13 +112,25 @@ // Assign addresses to external addressables. applyLookupResult(*LR); + // Copy block content to working memory. + copyBlockContentToWorkingMemory(Layout, *Alloc); + + LLVM_DEBUG({ + dbgs() << "Link graph \"" << G->getName() + << "\" before post-allocation passes:\n"; + dumpGraph(dbgs()); + }); + + if (auto Err = runPasses(Passes.PostAllocationPasses)) + return deallocateAndBailOut(std::move(Err)); + LLVM_DEBUG({ dbgs() << "Link graph \"" << G->getName() << "\" before copy-and-fixup:\n"; dumpGraph(dbgs()); }); - // Copy block content to working memory and fix up. - if (auto Err = copyAndFixUpBlocks(Layout, *Alloc)) + // Fix up block content. + if (auto Err = fixUpBlocks(*G)) return deallocateAndBailOut(std::move(Err)); LLVM_DEBUG({ @@ -122,9 +152,16 @@ } void JITLinkerBase::linkPhase3(std::unique_ptr Self, Error Err) { + + LLVM_DEBUG({ + dbgs() << "Starting link phase 3 for graph " << G->getName() << "\n"; + }); + if (Err) return deallocateAndBailOut(std::move(Err)); Ctx->notifyFinalized(std::move(Alloc)); + + LLVM_DEBUG({ dbgs() << "Link of graph " << G->getName() << " complete\n"; }); } Error JITLinkerBase::runPasses(LinkGraphPassList &Passes) { @@ -165,7 +202,7 @@ } LLVM_DEBUG({ - dbgs() << "Segment ordering:\n"; + dbgs() << "Computed segment ordering:\n"; for (auto &KV : Layout) { dbgs() << " Segment " << static_cast(KV.first) << ":\n"; @@ -302,6 +339,77 @@ "All strong external symbols should have been resolved by now"); } +void JITLinkerBase::copyBlockContentToWorkingMemory( + const SegmentLayoutMap &Layout, JITLinkMemoryManager::Allocation &Alloc) { + + LLVM_DEBUG(dbgs() << "Copying block content:\n"); + for (auto &KV : Layout) { + auto &Prot = KV.first; + auto &SegLayout = KV.second; + + auto SegMem = + Alloc.getWorkingMemory(static_cast(Prot)); + char *LastBlockEnd = SegMem.data(); + char *BlockDataPtr = LastBlockEnd; + + LLVM_DEBUG({ + dbgs() << " Processing segment " + << static_cast(Prot) << " [ " + << (const void *)SegMem.data() << " .. " + << (const void *)((char *)SegMem.data() + SegMem.size()) + << " ]\n Processing content sections:\n"; + }); + + for (auto *B : SegLayout.ContentBlocks) { + LLVM_DEBUG(dbgs() << " " << *B << ":\n"); + + // Pad to alignment/alignment-offset. + BlockDataPtr = alignToBlock(BlockDataPtr, *B); + + LLVM_DEBUG({ + dbgs() << " Bumped block pointer to " << (const void *)BlockDataPtr + << " to meet block alignment " << B->getAlignment() + << " and alignment offset " << B->getAlignmentOffset() << "\n"; + }); + + // Zero pad up to alignment. 
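The gap left by the bump is zero-filled so padding bytes have a deterministic value. The bump itself must honour both the block's alignment and its alignment offset; a standalone sketch of the computation the alignToBlock helper used above performs (assumes a power-of-two alignment and addresses large enough that Addr - Offset cannot wrap):

```cpp
// Returns the first P' >= P with P' % Align == Offset.
char *alignToBlock(char *P, const Block &B) {
  uint64_t Align = B.getAlignment();        // power of two
  uint64_t Offset = B.getAlignmentOffset(); // block starts Offset bytes past
                                            // an Align boundary
  uint64_t Addr = reinterpret_cast<uint64_t>(P);
  uint64_t Bumped = ((Addr - Offset + Align - 1) & ~(Align - 1)) + Offset;
  return P + (Bumped - Addr);
}
```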
+ LLVM_DEBUG({ + if (LastBlockEnd != BlockDataPtr) + dbgs() << " Zero padding from " << (const void *)LastBlockEnd + << " to " << (const void *)BlockDataPtr << "\n"; + }); + + while (LastBlockEnd != BlockDataPtr) + *LastBlockEnd++ = 0; + + // Copy initial block content. + LLVM_DEBUG({ + dbgs() << " Copying block " << *B << " content, " + << B->getContent().size() << " bytes, from " + << (const void *)B->getContent().data() << " to " + << (const void *)BlockDataPtr << "\n"; + }); + memcpy(BlockDataPtr, B->getContent().data(), B->getContent().size()); + + // Point the block's content to the fixed up buffer. + B->setContent(StringRef(BlockDataPtr, B->getContent().size())); + + // Update block end pointer. + LastBlockEnd = BlockDataPtr + B->getContent().size(); + BlockDataPtr = LastBlockEnd; + } + + // Zero pad the rest of the segment. + LLVM_DEBUG({ + dbgs() << " Zero padding end of segment from " + << (const void *)LastBlockEnd << " to " + << (const void *)((char *)SegMem.data() + SegMem.size()) << "\n"; + }); + while (LastBlockEnd != SegMem.data() + SegMem.size()) + *LastBlockEnd++ = 0; + } +} + void JITLinkerBase::deallocateAndBailOut(Error Err) { assert(Err && "Should not be bailing out on success value"); assert(Alloc && "can not call deallocateAndBailOut before allocation"); diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp --- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -350,6 +350,9 @@ class MachO_x86_64_GOTAndStubsBuilder : public BasicGOTAndStubsBuilder { public: + static const uint8_t NullGOTEntryContent[8]; + static const uint8_t StubContent[6]; + MachO_x86_64_GOTAndStubsBuilder(LinkGraph &G) : BasicGOTAndStubsBuilder(G) {} @@ -367,7 +370,13 @@ void fixGOTEdge(Edge &E, Symbol &GOTEntry) { assert((E.getKind() == PCRel32GOT || E.getKind() == PCRel32GOTLoad) && "Not a GOT edge?"); - E.setKind(PCRel32); + // If this is a PCRel32GOT then change it to an ordinary PCRel32. If it is + // a PCRel32GOTLoad then leave it as-is for now. We will use the kind to + // check for GOT optimization opportunities in the + // optimizeMachO_x86_64_GOTAndStubs pass below. + if (E.getKind() == PCRel32GOT) + E.setKind(PCRel32); + E.setTarget(GOTEntry); // Leave the edge addend as-is. } @@ -388,6 +397,11 @@ void fixExternalBranchEdge(Edge &E, Symbol &Stub) { assert(E.getKind() == Branch32 && "Not a Branch32 edge?"); assert(E.getAddend() == 0 && "Branch32 edge has non-zero addend?"); + + // Set the edge kind to Branch32ToStub. We will use this to check for stub + // optimization opportunities in the optimizeMachO_x86_64_GOTAndStubs pass + // below. + E.setKind(Branch32ToStub); E.setTarget(Stub); } @@ -417,8 +431,6 @@ sizeof(StubContent)); } - static const uint8_t NullGOTEntryContent[8]; - static const uint8_t StubContent[6]; Section *GOTSection = nullptr; Section *StubsSection = nullptr; }; @@ -429,6 +441,89 @@ 0xFF, 0x25, 0x00, 0x00, 0x00, 0x00}; } // namespace +Error optimizeMachO_x86_64_GOTAndStubs(LinkGraph &G) { + LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n"); + + for (auto *B : G.blocks()) + for (auto &E : B->edges()) + if (E.getKind() == PCRel32GOTLoad) { + assert(E.getOffset() >= 3 && "GOT edge occurs too early in block"); + + // Switch the edge kind to PCRel32: Whether we change the edge target + // or not this will be the desired kind. + E.setKind(PCRel32); + + // Optimize GOT references. 
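The rewrite being set up here is the classic GOT-indirection elision: once addresses are final, a load through the GOT whose ultimate target lies within ±2GB can compute the address directly. Only one opcode byte changes, which is why the code below inspects the two bytes preceding the relocated field (encodings shown for the REX.W form the pass recognizes):

```cpp
// Before: load the target's address from its GOT entry.
//   movq foo@GOTPCREL(%rip), %rax     ; 48 8b 05 <rel32 to GOT slot>
// After:  compute the address directly; the GOT entry becomes dead.
//   leaq foo(%rip), %rax              ; 48 8d 05 <rel32 to foo>
//
// E.getOffset() points at <rel32>, so the 0x8b opcode byte sits at
// E.getOffset() - 2 and the 0x48 REX.W prefix at E.getOffset() - 3;
// patching the opcode to 0x8d is the entire rewrite.
```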
+ auto &GOTBlock = E.getTarget().getBlock(); + assert(GOTBlock.getSize() == G.getPointerSize() && + "GOT entry block should be pointer sized"); + assert(GOTBlock.edges_size() == 1 && + "GOT entry should only have one outgoing edge"); + + auto &GOTTarget = GOTBlock.edges().begin()->getTarget(); + JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset(); + JITTargetAddress TargetAddr = GOTTarget.getAddress(); + + // Check that this is a recognized MOV instruction. + // FIXME: Can we assume this? + constexpr uint8_t MOVQRIPRel[] = {0x48, 0x8b}; + if (strncmp(B->getContent().data() + E.getOffset() - 3, + reinterpret_cast<const char *>(MOVQRIPRel), 2) != 0) + continue; + + int64_t Displacement = TargetAddr - EdgeAddr + 4; + if (Displacement >= std::numeric_limits<int32_t>::min() && + Displacement <= std::numeric_limits<int32_t>::max()) { + E.setTarget(GOTTarget); + auto *BlockData = reinterpret_cast<uint8_t *>( + const_cast<char *>(B->getContent().data())); + BlockData[E.getOffset() - 2] = 0x8d; + LLVM_DEBUG({ + dbgs() << " Replaced GOT load with LEA:\n "; + printEdge(dbgs(), *B, E, + getMachOX86RelocationKindName(E.getKind())); + dbgs() << "\n"; + }); + } + } else if (E.getKind() == Branch32ToStub) { + + // Switch the edge kind to Branch32: Whether we change the edge target + // or not this will be the desired kind. + E.setKind(Branch32); + + auto &StubBlock = E.getTarget().getBlock(); + assert(StubBlock.getSize() == + sizeof(MachO_x86_64_GOTAndStubsBuilder::StubContent) && + "Stub block should be stub sized"); + assert(StubBlock.edges_size() == 1 && + "Stub block should only have one outgoing edge"); + + auto &GOTBlock = StubBlock.edges().begin()->getTarget().getBlock(); + assert(GOTBlock.getSize() == G.getPointerSize() && + "GOT block should be pointer sized"); + assert(GOTBlock.edges_size() == 1 && + "GOT block should only have one outgoing edge"); + + auto &GOTTarget = GOTBlock.edges().begin()->getTarget(); + JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset(); + JITTargetAddress TargetAddr = GOTTarget.getAddress(); + + int64_t Displacement = TargetAddr - EdgeAddr + 4; + if (Displacement >= std::numeric_limits<int32_t>::min() && + Displacement <= std::numeric_limits<int32_t>::max()) { + E.setTarget(GOTTarget); + LLVM_DEBUG({ + dbgs() << " Replaced stub branch with direct branch:\n "; + printEdge(dbgs(), *B, E, + getMachOX86RelocationKindName(E.getKind())); + dbgs() << "\n"; + }); + } + } + + return Error::success(); +} + namespace llvm { namespace jitlink { @@ -570,6 +665,9 @@ MachO_x86_64_GOTAndStubsBuilder(G).run(); return Error::success(); }); + + // Add GOT/Stubs optimizer pass.
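Registering the optimizer as a post-allocation pass (next hunk) is what makes the displacement checks above possible: addresses are final, but fixups have not yet run. Clients can hook the same pipeline point through JITLinkContext::modifyPassConfig; a hypothetical example, sketched under the assumption that the context override has this signature:

```cpp
Error MyJITLinkContext::modifyPassConfig(const Triple &TT,
                                         PassConfiguration &Config) {
  Config.PostAllocationPasses.push_back([](LinkGraph &G) -> Error {
    // Addresses are assigned by this point, so they are safe to inspect.
    for (auto *Sym : G.defined_symbols())
      dbgs() << Sym->getName() << " @ "
             << formatv("{0:x16}", Sym->getAddress()) << "\n";
    return Error::success();
  });
  return Error::success();
}
```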
+ Config.PostAllocationPasses.push_back(optimizeMachO_x86_64_GOTAndStubs); } if (auto Err = Ctx->modifyPassConfig(TT, Config)) @@ -583,6 +681,8 @@ switch (R) { case Branch32: return "Branch32"; + case Branch32ToStub: + return "Branch32ToStub"; case Pointer32: return "Pointer32"; case Pointer64: diff --git a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp --- a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp @@ -7,9 +7,12 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" + +#include "llvm/ADT/Hashing.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" +#include "llvm/Support/FormatVariadic.h" using namespace llvm; using namespace llvm::orc; @@ -294,29 +297,52 @@ // // FIXME: We apply this promotion once per partitioning. It's safe, but // overkill. - auto ExtractedTSM = TSM.withModuleDo([&](Module &M) -> Expected { auto PromotedGlobals = PromoteSymbols(M); if (!PromotedGlobals.empty()) { + MangleAndInterner Mangle(ES, M.getDataLayout()); SymbolFlagsMap SymbolFlags; - for (auto &GV : PromotedGlobals) - SymbolFlags[Mangle(GV->getName())] = - JITSymbolFlags::fromGlobalValue(*GV); + IRSymbolMapper::add(ES, *getManglingOptions(), + PromotedGlobals, SymbolFlags); + if (auto Err = R.defineMaterializing(SymbolFlags)) return std::move(Err); } expandPartition(*GVsToExtract); + // Submodule name is given by hashing the names of the globals. + std::string SubModuleName; + { + std::vector HashGVs; + HashGVs.reserve(GVsToExtract->size()); + for (auto *GV : *GVsToExtract) + HashGVs.push_back(GV); + llvm::sort(HashGVs, [](const GlobalValue *LHS, const GlobalValue *RHS) { + return LHS->getName() < RHS->getName(); + }); + hash_code HC(0); + for (auto *GV : HashGVs) { + assert(GV->hasName() && "All GVs to extract should be named by now"); + auto GVName = GV->getName(); + HC = hash_combine(HC, hash_combine_range(GVName.begin(), GVName.end())); + } + raw_string_ostream(SubModuleName) + << ".submodule." + << formatv(sizeof(size_t) == 8 ? "{0:x16}" : "{0:x8}", + static_cast(HC)) + << ".ll"; + } + // Extract the requested partiton (plus any necessary aliases) and // put the rest back into the impl dylib. 
auto ShouldExtract = [&](const GlobalValue &GV) -> bool { return GVsToExtract->count(&GV); }; - return extractSubModule(TSM, ".submodule", ShouldExtract); + return extractSubModule(TSM, SubModuleName , ShouldExtract); }); if (!ExtractedTSM) { diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -144,6 +144,8 @@ char FailedToMaterialize::ID = 0; char SymbolsNotFound::ID = 0; char SymbolsCouldNotBeRemoved::ID = 0; +char MissingSymbolDefinitions::ID = 0; +char UnexpectedSymbolDefinitions::ID = 0; RegisterDependenciesFunction NoDependenciesToRegister = RegisterDependenciesFunction(); @@ -352,6 +354,24 @@ OS << "Symbols could not be removed: " << Symbols; } +std::error_code MissingSymbolDefinitions::convertToErrorCode() const { + return orcError(OrcErrorCode::MissingSymbolDefinitions); +} + +void MissingSymbolDefinitions::log(raw_ostream &OS) const { + OS << "Missing definitions in module " << ModuleName + << ": " << Symbols; +} + +std::error_code UnexpectedSymbolDefinitions::convertToErrorCode() const { + return orcError(OrcErrorCode::UnexpectedSymbolDefinitions); +} + +void UnexpectedSymbolDefinitions::log(raw_ostream &OS) const { + OS << "Unexpected definitions in module " << ModuleName + << ": " << Symbols; +} + AsynchronousSymbolQuery::AsynchronousSymbolQuery( const SymbolLookupSet &Symbols, SymbolState RequiredState, SymbolsResolvedCallback NotifyComplete) diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -107,12 +107,16 @@ /// llvm.global_ctors. class GlobalCtorDtorScraper { public: - GlobalCtorDtorScraper(GenericLLVMIRPlatformSupport &PS) : PS(PS) {} + + GlobalCtorDtorScraper(GenericLLVMIRPlatformSupport &PS, + StringRef InitFunctionPrefix) + : PS(PS), InitFunctionPrefix(InitFunctionPrefix) {} Expected operator()(ThreadSafeModule TSM, MaterializationResponsibility &R); private: GenericLLVMIRPlatformSupport &PS; + StringRef InitFunctionPrefix; }; /// Generic IR Platform Support @@ -125,12 +129,14 @@ // GenericLLVMIRPlatform &P) : P(P) { GenericLLVMIRPlatformSupport(LLJIT &J) : J(J) { + MangleAndInterner Mangle(getExecutionSession(), J.getDataLayout()); + InitFunctionPrefix = Mangle("__orc_init_func."); + getExecutionSession().setPlatform( std::make_unique(*this)); - setInitTransform(J, GlobalCtorDtorScraper(*this)); + setInitTransform(J, GlobalCtorDtorScraper(*this, *InitFunctionPrefix)); - MangleAndInterner Mangle(getExecutionSession(), J.getDataLayout()); SymbolMap StdInterposes; StdInterposes[Mangle("__lljit.platform_support_instance")] = @@ -169,6 +175,18 @@ std::lock_guard Lock(PlatformSupportMutex); if (auto &InitSym = MU.getInitializerSymbol()) InitSymbols[&JD].add(InitSym); + else { + // If there's no identified init symbol attached, but there is a symbol + // with the GenericIRPlatform::InitFunctionPrefix, then treat that as + // an init function. Add the symbol to both the InitSymbols map (which + // will trigger a lookup to materialize the module) and the InitFunctions + // map (which holds the names of the symbols to execute). 
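To make the prefix convention concrete: for a module M whose llvm.global_ctors lists ctor_a and ctor_b, the scraper effectively replaces the ctor array with one synthesized function whose name carries the __orc_init_func. prefix, and it is that name the platform later spots in getSymbols(). Roughly (a sketch; the real code also deals with priorities and destructors):

```cpp
// define void @"__orc_init_func.M"() {
// entry:
//   call void @ctor_a()
//   call void @ctor_b()
//   ret void
// }
```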
+ for (auto &KV : MU.getSymbols()) + if ((*KV.first).startswith(*InitFunctionPrefix)) { + InitSymbols[&JD].add(KV.first); + InitFunctions[&JD].add(KV.first); + } + } return Error::success(); } @@ -387,6 +405,7 @@ std::mutex PlatformSupportMutex; LLJIT &J; + SymbolStringPtr InitFunctionPrefix; DenseMap InitSymbols; DenseMap InitFunctions; DenseMap DeInitFunctions; @@ -415,7 +434,7 @@ std::string InitFunctionName; raw_string_ostream(InitFunctionName) - << "__orc_init." << M.getModuleIdentifier(); + << InitFunctionPrefix << M.getModuleIdentifier(); MangleAndInterner Mangle(PS.getExecutionSession(), M.getDataLayout()); auto InternedName = Mangle(InitFunctionName); diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -148,6 +148,36 @@ if (const auto &InitSym = MR.getInitializerSymbol()) InternedResult[InitSym] = JITEvaluatedSymbol(); + { + // Check that InternedResult matches up with MR.getSymbols(). + // This guards against faulty transformations / compilers / object caches. + + if (InternedResult.size() > MR.getSymbols().size()) { + SymbolNameVector ExtraSymbols; + for (auto &KV : InternedResult) + if (!MR.getSymbols().count(KV.first)) + ExtraSymbols.push_back(KV.first); + ES.reportError( + make_error<UnexpectedSymbolDefinitions>(G.getName(), + std::move(ExtraSymbols))); + MR.failMaterialization(); + return; + } + + SymbolNameVector MissingSymbols; + for (auto &KV : MR.getSymbols()) + if (!InternedResult.count(KV.first)) + MissingSymbols.push_back(KV.first); + + if (!MissingSymbols.empty()) { + ES.reportError( + make_error<MissingSymbolDefinitions>(G.getName(), + std::move(MissingSymbols))); + MR.failMaterialization(); + return; + } + } + if (auto Err = MR.notifyResolved(InternedResult)) { Layer.getExecutionSession().reportError(std::move(Err)); MR.failMaterialization(); diff --git a/llvm/lib/ExecutionEngine/OrcError/OrcError.cpp b/llvm/lib/ExecutionEngine/OrcError/OrcError.cpp --- a/llvm/lib/ExecutionEngine/OrcError/OrcError.cpp +++ b/llvm/lib/ExecutionEngine/OrcError/OrcError.cpp @@ -61,6 +61,10 @@ "(Use StringError to get error message)"; case OrcErrorCode::UnknownResourceHandle: return "Unknown resource handle"; + case OrcErrorCode::MissingSymbolDefinitions: + return "MissingSymbolDefinitions"; + case OrcErrorCode::UnexpectedSymbolDefinitions: + return "UnexpectedSymbolDefinitions"; } llvm_unreachable("Unhandled error code"); } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4727,7 +4727,9 @@ case Intrinsic::umul_fix: case Intrinsic::umul_fix_sat: case Intrinsic::sdiv_fix: - case Intrinsic::udiv_fix: { + case Intrinsic::sdiv_fix_sat: + case Intrinsic::udiv_fix: + case Intrinsic::udiv_fix_sat: { Value *Op1 = Call.getArgOperand(0); Value *Op2 = Call.getArgOperand(1); Assert(Op1->getType()->isIntOrIntVectorTy(), @@ -4742,7 +4744,7 @@ "third argument of [us][mul|div]_fix[_sat] must fit within 32 bits"); if (ID == Intrinsic::smul_fix || ID == Intrinsic::smul_fix_sat || - ID == Intrinsic::sdiv_fix || ID == Intrinsic::sdiv_fix_sat) { Assert( Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(), "the scale of s[mul|div]_fix[_sat] must be less than the width of " diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -634,20
+634,29 @@ } } - MCSection *ELFSection = getContext().getELFSection( + MCSectionELF *Section = getContext().getELFSection( SectionName, Type, Flags, Size, GroupName, UniqueID, LinkedToSym); - getStreamer().SwitchSection(ELFSection, Subsection); + getStreamer().SwitchSection(Section, Subsection); + if (Section->getType() != Type) + Error(loc, "changed section type for " + SectionName + ", expected: 0x" + + utohexstr(Section->getType())); + if (Section->getFlags() != Flags) + Error(loc, "changed section flags for " + SectionName + ", expected: 0x" + + utohexstr(Section->getFlags())); + if (Section->getEntrySize() != Size) + Error(loc, "changed section entsize for " + SectionName + + ", expected: " + Twine(Section->getEntrySize())); if (getContext().getGenDwarfForAssembly()) { - bool InsertResult = getContext().addGenDwarfSection(ELFSection); + bool InsertResult = getContext().addGenDwarfSection(Section); if (InsertResult) { if (getContext().getDwarfVersion() <= 2) Warning(loc, "DWARF2 only supports one section per compilation unit"); - if (!ELFSection->getBeginSymbol()) { + if (!Section->getBeginSymbol()) { MCSymbol *SectionStartSymbol = getContext().createTempSymbol(); getStreamer().emitLabel(SectionStartSymbol); - ELFSection->setBeginSymbol(SectionStartSymbol); + Section->setBeginSymbol(SectionStartSymbol); } } } diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -128,6 +128,7 @@ NameToIdxMap DynSymN2I; ELFYAML::Object &Doc; + uint64_t LocationCounter = 0; bool HasError = false; yaml::ErrorHandler ErrHandler; void reportError(const Twine &Msg); @@ -218,6 +219,8 @@ ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH); + void assignSectionAddress(Elf_Shdr &SHeader, ELFYAML::Section *YAMLSec); + public: static bool writeELF(raw_ostream &OS, ELFYAML::Object &Doc, yaml::ErrorHandler EH); @@ -390,6 +393,8 @@ else return false; + LocationCounter += Header.sh_size; + // Override section fields if requested. overrideFields(YAMLSec, Header); return true; @@ -413,6 +418,7 @@ for (const std::unique_ptr &D : Doc.Chunks) { if (auto S = dyn_cast(D.get())) { writeFill(*S, CBA); + LocationCounter += S->Size; continue; } @@ -438,9 +444,10 @@ SHeader.sh_type = Sec->Type; if (Sec->Flags) SHeader.sh_flags = *Sec->Flags; - SHeader.sh_addr = Sec->Address; SHeader.sh_addralign = Sec->AddressAlign; + assignSectionAddress(SHeader, Sec); + if (!Sec->Link.empty()) SHeader.sh_link = toSectionIndex(Sec->Link, Sec->Name); @@ -500,11 +507,34 @@ llvm_unreachable("Unknown section type"); } + LocationCounter += SHeader.sh_size; + // Override section fields if requested. overrideFields(Sec, SHeader); } } +template +void ELFState::assignSectionAddress(Elf_Shdr &SHeader, + ELFYAML::Section *YAMLSec) { + if (YAMLSec && YAMLSec->Address) { + SHeader.sh_addr = *YAMLSec->Address; + LocationCounter = *YAMLSec->Address; + return; + } + + // sh_addr represents the address in the memory image of a process. Sections + // in a relocatable object file or non-allocatable sections do not need + // sh_addr assignment. + if (Doc.Header.Type.value == ELF::ET_REL || + !(SHeader.sh_flags & ELF::SHF_ALLOC)) + return; + + LocationCounter = + alignTo(LocationCounter, SHeader.sh_addralign ? SHeader.sh_addralign : 1); + SHeader.sh_addr = LocationCounter; +} + static size_t findFirstNonGlobal(ArrayRef Symbols) { for (size_t I = 0; I < Symbols.size(); ++I) if (Symbols[I].Binding.value != ELF::STB_LOCAL) @@ -629,7 +659,8 @@ ? 
(uint64_t)(*YAMLSec->EntSize) : sizeof(Elf_Sym); SHeader.sh_addralign = YAMLSec ? (uint64_t)YAMLSec->AddressAlign : 8; - SHeader.sh_addr = YAMLSec ? (uint64_t)YAMLSec->Address : 0; + + assignSectionAddress(SHeader, YAMLSec); auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); if (RawSec && (RawSec->Content || RawSec->Size)) { @@ -678,8 +709,7 @@ // If the section is explicitly described in the YAML // then we want to use its section address. - if (YAMLSec) - SHeader.sh_addr = YAMLSec->Address; + assignSectionAddress(SHeader, YAMLSec); } template void ELFState::reportError(const Twine &Msg) { diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1013,7 +1013,7 @@ IO.mapOptional("Name", Section.Name, StringRef()); IO.mapRequired("Type", Section.Type); IO.mapOptional("Flags", Section.Flags); - IO.mapOptional("Address", Section.Address, Hex64(0)); + IO.mapOptional("Address", Section.Address); IO.mapOptional("Link", Section.Link, StringRef()); IO.mapOptional("AddressAlign", Section.AddressAlign, Hex64(0)); IO.mapOptional("EntSize", Section.EntSize); diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -70,16 +70,24 @@ llvm_unreachable("Unknown IR unit"); } -void printIR(const Module *M, StringRef Banner, StringRef Extra = StringRef()) { - dbgs() << Banner << Extra << "\n"; - M->print(dbgs(), nullptr, false); -} void printIR(const Function *F, StringRef Banner, StringRef Extra = StringRef()) { if (!llvm::isFunctionInPrintList(F->getName())) return; dbgs() << Banner << Extra << "\n" << static_cast(*F); } + +void printIR(const Module *M, StringRef Banner, StringRef Extra = StringRef()) { + if (llvm::isFunctionInPrintList("*") || llvm::forcePrintModuleIR()) { + dbgs() << Banner << Extra << "\n"; + M->print(dbgs(), nullptr, false); + } else { + for (const auto &F : M->functions()) { + printIR(&F, Banner, Extra); + } + } +} + void printIR(const LazyCallGraph::SCC *C, StringRef Banner, StringRef Extra = StringRef()) { bool BannerPrinted = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -154,9 +154,6 @@ InstructionSelector::ComplexRendererFns selectVOP3PMods(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns - selectVOP3PMods0(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns selectVOP3OpSelMods0(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2508,23 +2508,6 @@ }}; } -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PMods0(MachineOperand &Root) const { - MachineRegisterInfo &MRI - = Root.getParent()->getParent()->getParent()->getRegInfo(); - - Register Src; - unsigned Mods; - std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); - - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } , // src_mods - // FIXME: Handle clamp and 
op_sel - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } - }}; -} - InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -440,7 +440,7 @@ S.PushSection(); S.SwitchSection(Context.getELFSection( - ElfNote::SectionName, ELF::SHT_NOTE, 0)); + ElfNote::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC)); S.emitIntValue(NameSZ, 4); // namesz S.emitValue(DescSZ, 4); // descz S.emitIntValue(NoteType, 4); // type diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -15323,6 +15323,9 @@ return false; } + if (Subtarget->hasMVEIntegerOps()) + return true; + // Don't create a loadext if we can fold the extension into a wide/long // instruction. // If there's more than one user instruction, the loadext is desirable no diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -465,7 +465,7 @@ short getRegForm(const MachineInstr &MI) const; unsigned getSize(const MachineInstr &MI) const; uint64_t getType(const MachineInstr &MI) const; - unsigned getUnits(const MachineInstr &MI) const; + InstrStage::FuncUnits getUnits(const MachineInstr &MI) const; MachineBasicBlock::instr_iterator expandVGatherPseudo(MachineInstr &MI) const; diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -4485,7 +4485,7 @@ return (F >> HexagonII::TypePos) & HexagonII::TypeMask; } -unsigned HexagonInstrInfo::getUnits(const MachineInstr &MI) const { +InstrStage::FuncUnits HexagonInstrInfo::getUnits(const MachineInstr &MI) const { const InstrItineraryData &II = *Subtarget.getInstrItineraryData(); const InstrStage &IS = *II.beginStage(MI.getDesc().getSchedClass()); diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -1060,8 +1060,7 @@ // we ignore the instruction. 
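The Hexagon hunks below are the consumer side of a wider cleanup: functional-unit masks are now InstrStage::FuncUnits (a 64-bit type) rather than unsigned, so itineraries with more than 32 units no longer truncate. A sketch of the kind of test that relies on the full width (canReserve is a hypothetical helper, not part of the patch):

```cpp
// An instruction may issue if at least one of its candidate functional
// units is still free in the current packet.
bool canReserve(const InstrStage &IS, InstrStage::FuncUnits FreeUnits) {
  return (IS.getUnits() & FreeUnits) != 0;
}
```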
const MCInstrDesc& TID = MI.getDesc(); auto *IS = ResourceTracker->getInstrItins()->beginStage(TID.getSchedClass()); - unsigned FuncUnits = IS->getUnits(); - return !FuncUnits; + return !IS->getUnits(); } bool HexagonPacketizerList::isSoloInstruction(const MachineInstr &MI) { diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" -#include "llvm/Object/MachO.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -204,7 +204,6 @@ bool tryBitfieldInsert(SDNode *N); bool tryBitPermutation(SDNode *N); bool tryIntCompareInGPR(SDNode *N); - bool tryAndWithMask(SDNode *N); // tryTLSXFormLoad - Convert an ISD::LOAD fed by a PPCISD::ADD_TLS into // an X-Form load instruction with the offset being a relocation coming from @@ -343,6 +342,11 @@ private: bool trySETCC(SDNode *N); + bool tryAsSingleRLDICL(SDNode *N); + bool tryAsSingleRLDICR(SDNode *N); + bool tryAsSingleRLWINM(SDNode *N); + bool tryAsSingleRLWINM8(SDNode *N); + bool tryAsSingleRLWIMI(SDNode *N); void PeepholePPC64(); void PeepholePPC64ZExt(); @@ -4371,142 +4375,172 @@ return true; } -bool PPCDAGToDAGISel::tryAndWithMask(SDNode *N) { - if (N->getOpcode() != ISD::AND) +bool PPCDAGToDAGISel::tryAsSingleRLWINM(SDNode *N) { + assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected"); + unsigned Imm; + if (!isInt32Immediate(N->getOperand(1), Imm)) return false; SDLoc dl(N); SDValue Val = N->getOperand(0); - unsigned Imm, Imm2, SH, MB, ME; - uint64_t Imm64; - + unsigned SH, MB, ME; // If this is an and of a value rotated between 0 and 31 bits and then and'd // with a mask, emit rlwinm - if (isInt32Immediate(N->getOperand(1), Imm) && - isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) { - SDValue Val = N->getOperand(0).getOperand(0); - SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl), - getI32Imm(ME, dl) }; + if (isRotateAndMask(Val.getNode(), Imm, false, SH, MB, ME)) { + Val = Val.getOperand(0); + SDValue Ops[] = {Val, getI32Imm(SH, dl), getI32Imm(MB, dl), + getI32Imm(ME, dl)}; CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); return true; } // If this is just a masked value where the input is not handled, and // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm - if (isInt32Immediate(N->getOperand(1), Imm)) { - if (isRunOfOnes(Imm, MB, ME) && - N->getOperand(0).getOpcode() != ISD::ROTL) { - SDValue Ops[] = { Val, getI32Imm(0, dl), getI32Imm(MB, dl), - getI32Imm(ME, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); - return true; - } - // AND X, 0 -> 0, not "rlwinm 32". - if (Imm == 0) { - ReplaceUses(SDValue(N, 0), N->getOperand(1)); - return true; - } + if (isRunOfOnes(Imm, MB, ME) && Val.getOpcode() != ISD::ROTL) { + SDValue Ops[] = {Val, getI32Imm(0, dl), getI32Imm(MB, dl), + getI32Imm(ME, dl)}; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } - // ISD::OR doesn't get all the bitfield insertion fun. - // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a - // bitfield insert. 
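A concrete instance of the bitfield-insert pattern described above, with illustrative values; the same derivation survives in the new tryAsSingleRLWIMI helper further down:

```cpp
// (x | 0x0000FF00) & 0xFFFFFF00      ; c1 = OR mask, c2 = AND mask
//   c1 ^ c2        = 0xFFFF0000
//   m = ~(c1 ^ c2) = 0x0000FFFF      ; a run of ones -> eligible
//   ~c2 & c1       = 0x00000000      ; no bit is forced to 1 then cleared
// so the expression equals (c1 & m) | (x & ~m): keep x's upper half and
// insert c1's lower half -- a single RLWIMI with SH = 0.
```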
- if (N->getOperand(0).getOpcode() == ISD::OR && - isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) { - // The idea here is to check whether this is equivalent to: - // (c1 & m) | (x & ~m) - // where m is a run-of-ones mask. The logic here is that, for each bit in - // c1 and c2: - // - if both are 1, then the output will be 1. - // - if both are 0, then the output will be 0. - // - if the bit in c1 is 0, and the bit in c2 is 1, then the output will - // come from x. - // - if the bit in c1 is 1, and the bit in c2 is 0, then the output will - // be 0. - // If that last condition is never the case, then we can form m from the - // bits that are the same between c1 and c2. - unsigned MB, ME; - if (isRunOfOnes(~(Imm^Imm2), MB, ME) && !(~Imm & Imm2)) { - SDValue Ops[] = { N->getOperand(0).getOperand(0), - N->getOperand(0).getOperand(1), - getI32Imm(0, dl), getI32Imm(MB, dl), - getI32Imm(ME, dl) }; - ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops)); - return true; - } - } - } else if (isInt64Immediate(N->getOperand(1).getNode(), Imm64)) { - // If this is a 64-bit zero-extension mask, emit rldicl. - if (isMask_64(Imm64)) { - MB = 64 - countTrailingOnes(Imm64); - SH = 0; - - if (Val.getOpcode() == ISD::ANY_EXTEND) { - auto Op0 = Val.getOperand(0); - if ( Op0.getOpcode() == ISD::SRL && - isInt32Immediate(Op0.getOperand(1).getNode(), Imm) && Imm <= MB) { - - auto ResultType = Val.getNode()->getValueType(0); - auto ImDef = CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, - ResultType); - SDValue IDVal (ImDef, 0); - - Val = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, - ResultType, IDVal, Op0.getOperand(0), - getI32Imm(1, dl)), 0); - SH = 64 - Imm; - } - } + // AND X, 0 -> 0, not "rlwinm 32". + if (Imm == 0) { + ReplaceUses(SDValue(N, 0), N->getOperand(1)); + return true; + } - // If the operand is a logical right shift, we can fold it into this - // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb) - // for n <= mb. The right shift is really a left rotate followed by a - // mask, and this mask is a more-restrictive sub-mask of the mask implied - // by the shift. - if (Val.getOpcode() == ISD::SRL && - isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) { - assert(Imm < 64 && "Illegal shift amount"); - Val = Val.getOperand(0); - SH = 64 - Imm; - } + return false; +} - SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops); - return true; - } else if (isMask_64(~Imm64)) { - // If this is a negated 64-bit zero-extension mask, - // i.e. the immediate is a sequence of ones from most significant side - // and all zero for reminder, we should use rldicr. - MB = 63 - countTrailingOnes(~Imm64); - SH = 0; - SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops); - return true; - } +bool PPCDAGToDAGISel::tryAsSingleRLWINM8(SDNode *N) { + assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected"); + uint64_t Imm64; + if (!isInt64Immediate(N->getOperand(1).getNode(), Imm64)) + return false; - // It is not 16-bit imm that means we need two instructions at least if - // using "and" instruction. Try to exploit it with rotate mask instructions. 
- if (isRunOfOnes64(Imm64, MB, ME)) { - if (MB >= 32 && MB <= ME) { - // MB ME - // +----------------------+ - // |xxxxxxxxxxx00011111000| - // +----------------------+ - // 0 32 64 - // We can only do it if the MB is larger than 32 and MB <= ME - // as RLWINM will replace the content of [0 - 32) with [32 - 64) even - // we didn't rotate it. - SDValue Ops[] = { Val, getI64Imm(0, dl), getI64Imm(MB - 32, dl), - getI64Imm(ME - 32, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLWINM8, MVT::i64, Ops); - return true; - } - // TODO - handle it with rldicl + rldicl - } + unsigned MB, ME; + if (isRunOfOnes64(Imm64, MB, ME) && MB >= 32 && MB <= ME) { + // MB ME + // +----------------------+ + // |xxxxxxxxxxx00011111000| + // +----------------------+ + // 0 32 64 + // We can only do it if MB is at least 32 and MB <= ME, + // as RLWINM will replace the contents of [0 - 32) with [32 - 64) even + // though we didn't rotate it. + SDLoc dl(N); + SDValue Ops[] = {N->getOperand(0), getI64Imm(0, dl), getI64Imm(MB - 32, dl), + getI64Imm(ME - 32, dl)}; + CurDAG->SelectNodeTo(N, PPC::RLWINM8, MVT::i64, Ops); + return true; + } + + return false; +} + +bool PPCDAGToDAGISel::tryAsSingleRLWIMI(SDNode *N) { + assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected"); + unsigned Imm; + if (!isInt32Immediate(N->getOperand(1), Imm)) + return false; + + SDValue Val = N->getOperand(0); + unsigned Imm2; + // ISD::OR doesn't get all the bitfield insertion fun. + // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a + // bitfield insert. + if (Val.getOpcode() != ISD::OR || !isInt32Immediate(Val.getOperand(1), Imm2)) + return false; + + // The idea here is to check whether this is equivalent to: + // (c1 & m) | (x & ~m) + // where m is a run-of-ones mask. The logic here is that, for each bit in + // c1 and c2: + // - if both are 1, then the output will be 1. + // - if both are 0, then the output will be 0. + // - if the bit in c1 is 0, and the bit in c2 is 1, then the output will + // come from x. + // - if the bit in c1 is 1, and the bit in c2 is 0, then the output will + // be 0. + // If that last condition is never the case, then we can form m from the + // bits that are the same between c1 and c2. + unsigned MB, ME; + if (isRunOfOnes(~(Imm ^ Imm2), MB, ME) && !(~Imm & Imm2)) { + SDLoc dl(N); + SDValue Ops[] = {Val.getOperand(0), Val.getOperand(1), getI32Imm(0, dl), + getI32Imm(MB, dl), getI32Imm(ME, dl)}; + ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops)); + return true; + } + + return false; +} + +bool PPCDAGToDAGISel::tryAsSingleRLDICL(SDNode *N) { + assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected"); + uint64_t Imm64; + if (!isInt64Immediate(N->getOperand(1).getNode(), Imm64) || !isMask_64(Imm64)) + return false; + + // If this is a 64-bit zero-extension mask, emit rldicl.
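A worked example of the match that follows:

```cpp
// Imm64 = 0x000000000000FFFF, a 64-bit zero-extension mask:
//   countTrailingOnes(Imm64) = 16  ->  MB = 64 - 16 = 48, SH = 0
//   rldicl dst, src, 0, 48          ; rotate by 0, keep bits 48..63
//                                   ; (PPC numbering, bit 0 = MSB),
//                                   ; i.e. dst = src & 0xFFFF
```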
+ unsigned MB = 64 - countTrailingOnes(Imm64); + unsigned SH = 0; + unsigned Imm; + SDValue Val = N->getOperand(0); + SDLoc dl(N); + + if (Val.getOpcode() == ISD::ANY_EXTEND) { + auto Op0 = Val.getOperand(0); + if (Op0.getOpcode() == ISD::SRL && + isInt32Immediate(Op0.getOperand(1).getNode(), Imm) && Imm <= MB) { + + auto ResultType = Val.getNode()->getValueType(0); + auto ImDef = CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, ResultType); + SDValue IDVal(ImDef, 0); + + Val = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, ResultType, + IDVal, Op0.getOperand(0), + getI32Imm(1, dl)), + 0); + SH = 64 - Imm; + } + } + + // If the operand is a logical right shift, we can fold it into this + // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb) + // for n <= mb. The right shift is really a left rotate followed by a + // mask, and this mask is a more-restrictive sub-mask of the mask implied + // by the shift. + if (Val.getOpcode() == ISD::SRL && + isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) { + assert(Imm < 64 && "Illegal shift amount"); + Val = Val.getOperand(0); + SH = 64 - Imm; + } + + SDValue Ops[] = {Val, getI32Imm(SH, dl), getI32Imm(MB, dl)}; + CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops); + return true; +} + +bool PPCDAGToDAGISel::tryAsSingleRLDICR(SDNode *N) { + assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected"); + uint64_t Imm64; + if (!isInt64Immediate(N->getOperand(1).getNode(), Imm64) || + !isMask_64(~Imm64)) + return false; + + // If this is a negated 64-bit zero-extension mask, + // i.e. the immediate is a sequence of ones from the most significant side + // and all zeros for the remainder, we should use rldicr. + unsigned MB = 63 - countTrailingOnes(~Imm64); + unsigned SH = 0; + SDLoc dl(N); + SDValue Ops[] = {N->getOperand(0), getI32Imm(SH, dl), getI32Imm(MB, dl)}; + CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops); + return true; +} + // Select - Convert the specified operand from a target-independent to a // target-specific node if it hasn't already been changed. void PPCDAGToDAGISel::Select(SDNode *N) { @@ -4730,7 +4764,8 @@ case ISD::AND: // If this is an 'and' with a mask, try to emit rlwinm/rldicl/rldicr - if (tryAndWithMask(N)) + if (tryAsSingleRLWINM(N) || tryAsSingleRLWIMI(N) || tryAsSingleRLDICL(N) || + tryAsSingleRLDICR(N) || tryAsSingleRLWINM8(N)) return; // Other cases are autogenerated. diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -52,13 +52,14 @@ MachineBasicBlock::iterator MI) const override; // Return the byte offset from the incoming stack pointer of Reg's - // ABI-defined save slot. Return 0 if no slot is defined for Reg. - unsigned getRegSpillOffset(unsigned Reg) const { - return RegSpillOffsets[Reg]; - } + // ABI-defined save slot. Return 0 if no slot is defined for Reg. Adjust + // the offset in case MF has packed-stack. + unsigned getRegSpillOffset(MachineFunction &MF, unsigned Reg) const; // Get or create the frame index of where the old frame pointer is stored.
int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const; + + bool usePackedStack(MachineFunction &MF) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -62,18 +62,6 @@ RegSpillOffsets[SpillOffsetTable[I].Reg] = SpillOffsetTable[I].Offset; } -static bool usePackedStack(MachineFunction &MF) { - bool HasPackedStackAttr = MF.getFunction().hasFnAttribute("packed-stack"); - bool IsVarArg = MF.getFunction().isVarArg(); - bool CallConv = MF.getFunction().getCallingConv() != CallingConv::GHC; - bool BackChain = MF.getFunction().hasFnAttribute("backchain"); - bool FramAddressTaken = MF.getFrameInfo().isFrameAddressTaken(); - if (HasPackedStackAttr && BackChain) - report_fatal_error("packed-stack with backchain is currently unsupported."); - return HasPackedStackAttr && !IsVarArg && CallConv && !BackChain && - !FramAddressTaken; -} - bool SystemZFrameLowering:: assignCalleeSavedSpillSlots(MachineFunction &MF, const TargetRegisterInfo *TRI, @@ -87,71 +75,44 @@ unsigned LowGPR = 0; unsigned HighGPR = SystemZ::R15D; int StartSPOffset = SystemZMC::CallFrameSize; - int CurrOffset; - if (!usePackedStack(MF)) { - for (auto &CS : CSI) { - unsigned Reg = CS.getReg(); - int Offset = RegSpillOffsets[Reg]; - if (Offset) { - if (SystemZ::GR64BitRegClass.contains(Reg) && StartSPOffset > Offset) { - LowGPR = Reg; - StartSPOffset = Offset; - } - Offset -= SystemZMC::CallFrameSize; - int FrameIdx = MFFrame.CreateFixedSpillStackObject(8, Offset); - CS.setFrameIdx(FrameIdx); - } else - CS.setFrameIdx(INT32_MAX); - } + for (auto &CS : CSI) { + unsigned Reg = CS.getReg(); + int Offset = getRegSpillOffset(MF, Reg); + if (Offset) { + if (SystemZ::GR64BitRegClass.contains(Reg) && StartSPOffset > Offset) { + LowGPR = Reg; + StartSPOffset = Offset; + } + Offset -= SystemZMC::CallFrameSize; + int FrameIdx = MFFrame.CreateFixedSpillStackObject(8, Offset); + CS.setFrameIdx(FrameIdx); + } else + CS.setFrameIdx(INT32_MAX); + } - // Save the range of call-saved registers, for use by the - // prologue/epilogue inserters. - ZFI->setRestoreGPRRegs(LowGPR, HighGPR, StartSPOffset); - if (IsVarArg) { - // Also save the GPR varargs, if any. R6D is call-saved, so would - // already be included, but we also need to handle the call-clobbered - // argument registers. - unsigned FirstGPR = ZFI->getVarArgsFirstGPR(); - if (FirstGPR < SystemZ::NumArgGPRs) { - unsigned Reg = SystemZ::ArgGPRs[FirstGPR]; - int Offset = RegSpillOffsets[Reg]; - if (StartSPOffset > Offset) { - LowGPR = Reg; StartSPOffset = Offset; - } + // Save the range of call-saved registers, for use by the + // prologue/epilogue inserters. + ZFI->setRestoreGPRRegs(LowGPR, HighGPR, StartSPOffset); + if (IsVarArg) { + // Also save the GPR varargs, if any. R6D is call-saved, so would + // already be included, but we also need to handle the call-clobbered + // argument registers. + unsigned FirstGPR = ZFI->getVarArgsFirstGPR(); + if (FirstGPR < SystemZ::NumArgGPRs) { + unsigned Reg = SystemZ::ArgGPRs[FirstGPR]; + int Offset = getRegSpillOffset(MF, Reg); + if (StartSPOffset > Offset) { + LowGPR = Reg; StartSPOffset = Offset; } } - ZFI->setSpillGPRRegs(LowGPR, HighGPR, StartSPOffset); - - CurrOffset = -SystemZMC::CallFrameSize; - } else { - // Packed stack: put all the GPRs at the top of the Register save area. 
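The removed special case below is subsumed by the new getRegSpillOffset: with packed-stack, the standard slot is simply shifted toward the top of the 160-byte register save area. A worked example, assuming the usual s390x spill-offset table (GPR slot = 8 x register number):

```cpp
// getRegSpillOffset(MF, SystemZ::R15D):
//   standard ABI             : 120
//   packed-stack             : 120 + 32 = 152 = CallFrameSize - 8 (topmost)
//   packed-stack + backchain : 120 + 24 = 144 (slot 152 holds the chain;
//                              only legal with soft-float, see
//                              usePackedStack below)
```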
- uint32_t LowGR64Num = UINT32_MAX; - for (auto &CS : CSI) { - unsigned Reg = CS.getReg(); - if (SystemZ::GR64BitRegClass.contains(Reg)) { - unsigned GR64Num = SystemZMC::getFirstReg(Reg); - int Offset = -8 * (15 - GR64Num + 1); - if (LowGR64Num > GR64Num) { - LowGR64Num = GR64Num; - StartSPOffset = SystemZMC::CallFrameSize + Offset; - } - int FrameIdx = MFFrame.CreateFixedSpillStackObject(8, Offset); - CS.setFrameIdx(FrameIdx); - } else - CS.setFrameIdx(INT32_MAX); - } - if (LowGR64Num < UINT32_MAX) - LowGPR = SystemZMC::GR64Regs[LowGR64Num]; - - // Save the range of call-saved registers, for use by the - // prologue/epilogue inserters. - ZFI->setRestoreGPRRegs(LowGPR, HighGPR, StartSPOffset); - ZFI->setSpillGPRRegs(LowGPR, HighGPR, StartSPOffset); - - CurrOffset = LowGPR ? -(SystemZMC::CallFrameSize - StartSPOffset) : 0; } + ZFI->setSpillGPRRegs(LowGPR, HighGPR, StartSPOffset); // Create fixed stack objects for the remaining registers. + int CurrOffset = -SystemZMC::CallFrameSize; + if (usePackedStack(MF)) + CurrOffset += StartSPOffset; + for (auto &CS : CSI) { if (CS.getFrameIdx() != INT32_MAX) continue; @@ -511,10 +472,13 @@ .addCFIIndex(CFIIndex); SPOffsetFromCFA += Delta; - if (StoreBackchain) + if (StoreBackchain) { + // The back chain is stored topmost with packed-stack. + int Offset = usePackedStack(MF) ? SystemZMC::CallFrameSize - 8 : 0; BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::STG)) - .addReg(SystemZ::R1D, RegState::Kill).addReg(SystemZ::R15D).addImm(0) - .addReg(0); + .addReg(SystemZ::R1D, RegState::Kill).addReg(SystemZ::R15D) + .addImm(Offset).addReg(0); + } } if (HasFP) { @@ -662,14 +626,43 @@ } } +unsigned SystemZFrameLowering::getRegSpillOffset(MachineFunction &MF, + unsigned Reg) const { + bool IsVarArg = MF.getFunction().isVarArg(); + bool BackChain = MF.getFunction().hasFnAttribute("backchain"); + bool SoftFloat = MF.getSubtarget().hasSoftFloat(); + unsigned Offset = RegSpillOffsets[Reg]; + if (usePackedStack(MF) && !(IsVarArg && !SoftFloat)) { + if (SystemZ::GR64BitRegClass.contains(Reg)) + // Put all GPRs at the top of the Register save area with packed + // stack. Make room for the backchain if needed. + Offset += BackChain ? 24 : 32; + else + Offset = 0; + } + return Offset; +} + int SystemZFrameLowering:: getOrCreateFramePointerSaveIndex(MachineFunction &MF) const { SystemZMachineFunctionInfo *ZFI = MF.getInfo(); int FI = ZFI->getFramePointerSaveIndex(); if (!FI) { MachineFrameInfo &MFFrame = MF.getFrameInfo(); - FI = MFFrame.CreateFixedObject(8, -SystemZMC::CallFrameSize, false); + // The back chain is stored topmost with packed-stack. + int Offset = usePackedStack(MF) ? 
-8 : -SystemZMC::CallFrameSize; + FI = MFFrame.CreateFixedObject(8, Offset, false); ZFI->setFramePointerSaveIndex(FI); } return FI; } + +bool SystemZFrameLowering::usePackedStack(MachineFunction &MF) const { + bool HasPackedStackAttr = MF.getFunction().hasFnAttribute("packed-stack"); + bool BackChain = MF.getFunction().hasFnAttribute("backchain"); + bool SoftFloat = MF.getSubtarget().hasSoftFloat(); + if (HasPackedStackAttr && BackChain && !SoftFloat) + report_fatal_error("packed-stack + backchain + hard-float is unsupported."); + bool CallConv = MF.getFunction().getCallingConv() != CallingConv::GHC; + return HasPackedStackAttr && CallConv; +} diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1464,7 +1464,8 @@ // ...and a similar frame index for the caller-allocated save area // that will be used to store the incoming registers. - int64_t RegSaveOffset = -SystemZMC::CallFrameSize; + int64_t RegSaveOffset = + -SystemZMC::CallFrameSize + TFL->getRegSpillOffset(MF, SystemZ::R2D) - 16; unsigned RegSaveIndex = MFI.CreateFixedObject(1, RegSaveOffset, true); FuncInfo->setRegSaveFrameIndex(RegSaveIndex); @@ -1473,8 +1474,9 @@ if (NumFixedFPRs < SystemZ::NumArgFPRs && !useSoftFloat()) { SDValue MemOps[SystemZ::NumArgFPRs]; for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) { - unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]); - int FI = MFI.CreateFixedObject(8, RegSaveOffset + Offset, true); + unsigned Offset = TFL->getRegSpillOffset(MF, SystemZ::ArgFPRs[I]); + int FI = + MFI.CreateFixedObject(8, -SystemZMC::CallFrameSize + Offset, true); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I], &SystemZ::FP64BitRegClass); @@ -3241,6 +3243,8 @@ SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { + auto *TFL = + static_cast(Subtarget.getFrameLowering()); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setFrameAddressIsTaken(true); @@ -3249,9 +3253,12 @@ unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); + // Return null if the back chain is not present. + bool HasBackChain = MF.getFunction().hasFnAttribute("backchain"); + if (TFL->usePackedStack(MF) && !HasBackChain) + return DAG.getConstant(0, DL, PtrVT); + // By definition, the frame address is the address of the back chain. 
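In user terms (a behavioural sketch, assuming clang's -mpacked-stack/-mbackchain options map onto these function attributes):

```cpp
// Compiled with -mpacked-stack but without -mbackchain, no slot ever holds
// the caller's SP, so this now returns a null pointer instead of the
// address of an unwritten slot:
void *frame() { return __builtin_frame_address(0); }
```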
- auto *TFL = - static_cast(Subtarget.getFrameLowering()); int BackChainIdx = TFL->getOrCreateFramePointerSaveIndex(MF); SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT); diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp @@ -16,7 +16,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/Object/ELF.h" +#include "llvm/BinaryFormat/ELF.h" using namespace llvm; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp @@ -66,6 +66,17 @@ using BlockVector = SmallVector; using BlockSet = SmallPtrSet; +static BlockVector getSortedEntries(const BlockSet &Entries) { + BlockVector SortedEntries(Entries.begin(), Entries.end()); + llvm::sort(SortedEntries, + [](const MachineBasicBlock *A, const MachineBasicBlock *B) { + auto ANum = A->getNumber(); + auto BNum = B->getNumber(); + return ANum < BNum; + }); + return SortedEntries; +} + // Calculates reachability in a region. Ignores branches to blocks outside of // the region, and ignores branches to the region entry (for the case where // the region is the inner part of a loop). @@ -241,7 +252,6 @@ bool WebAssemblyFixIrreducibleControlFlow::processRegion( MachineBasicBlock *Entry, BlockSet &Blocks, MachineFunction &MF) { bool Changed = false; - // Remove irreducibility before processing child loops, which may take // multiple iterations. while (true) { @@ -249,12 +259,18 @@ bool FoundIrreducibility = false; - for (auto *LoopEntry : Graph.getLoopEntries()) { + for (auto *LoopEntry : getSortedEntries(Graph.getLoopEntries())) { // Find mutual entries - all entries which can reach this one, and // are reached by it (that always includes LoopEntry itself). All mutual // entries must be in the same loop, so if we have more than one, then we // have irreducible control flow. // + // (Note that we need to sort the entries here, as otherwise the order can + // matter: being mutual is a symmetric relationship, and each set of + // mutuals will be handled properly no matter which we see first. However, + // there can be multiple disjoint sets of mutuals, and which we process + // first changes the output.) + // // Note that irreducibility may involve inner loops, e.g. imagine A // starts one loop, and it has B inside it which starts an inner loop. // If we add a branch from all the way on the outside to B, then in a @@ -325,13 +341,7 @@ assert(Entries.size() >= 2); // Sort the entries to ensure a deterministic build. 
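The sort matters because BlockSet is a SmallPtrSet: its iteration order follows pointer values and can differ from run to run, so any decision made while walking it directly would make the output non-reproducible. Sketch of the two iteration modes (process is a hypothetical stand-in):

```cpp
for (MachineBasicBlock *MBB : Entries)                   // unstable order
  process(MBB);
for (MachineBasicBlock *MBB : getSortedEntries(Entries)) // stable, by number
  process(MBB);
```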
- BlockVector SortedEntries(Entries.begin(), Entries.end()); - llvm::sort(SortedEntries, - [&](const MachineBasicBlock *A, const MachineBasicBlock *B) { - auto ANum = A->getNumber(); - auto BNum = B->getNumber(); - return ANum < BNum; - }); + BlockVector SortedEntries = getSortedEntries(Entries); #ifndef NDEBUG for (auto Block : SortedEntries) diff --git a/llvm/lib/Target/X86/MCTargetDesc/LLVMBuild.txt b/llvm/lib/Target/X86/MCTargetDesc/LLVMBuild.txt --- a/llvm/lib/Target/X86/MCTargetDesc/LLVMBuild.txt +++ b/llvm/lib/Target/X86/MCTargetDesc/LLVMBuild.txt @@ -18,5 +18,5 @@ type = Library name = X86Desc parent = X86 -required_libraries = MC MCDisassembler Object Support X86Info X86Utils BinaryFormat +required_libraries = MC MCDisassembler Support X86Info X86Utils BinaryFormat add_to_library_groups = X86 diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -5394,6 +5394,163 @@ CurDAG->RemoveDeadNode(Node); return; } + break; + } + case X86ISD::MGATHER: { + auto *Mgt = cast(Node); + SDValue IndexOp = Mgt->getIndex(); + SDValue Mask = Mgt->getMask(); + MVT IndexVT = IndexOp.getSimpleValueType(); + MVT ValueVT = Node->getSimpleValueType(0); + MVT MaskVT = Mask.getSimpleValueType(); + + // This is just to prevent crashes if the nodes are malformed somehow. We're + // otherwise only doing loose type checking in here based on type what + // a type constraint would say just like table based isel. + if (!ValueVT.isVector() || !MaskVT.isVector()) + break; + + unsigned NumElts = ValueVT.getVectorNumElements(); + MVT ValueSVT = ValueVT.getVectorElementType(); + + bool IsFP = ValueSVT.isFloatingPoint(); + unsigned EltSize = ValueSVT.getSizeInBits(); + + unsigned Opc = 0; + bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1; + if (AVX512Gather) { + if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm; + else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) + Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm; + else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) + Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm; + else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) + Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm; + else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) + Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm; + else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) + Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm; + else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm; + else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm; + else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) + Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm; + else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) + Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm; + else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) + Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm; + else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) + Opc = IsFP ? 
X86::VGATHERQPDZrm : X86::VPGATHERQQZrm; + } else { + assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() && + "Unexpected mask VT!"); + if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm; + else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) + Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm; + else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) + Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm; + else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) + Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm; + else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm; + else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm; + else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) + Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm; + else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) + Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm; + } + + if (!Opc) + break; + + SDValue BasePtr = Mgt->getBasePtr(); + SDValue Base, Scale, Index, Disp, Segment; + if (!selectVectorAddr(Node, BasePtr, Base, Scale, Index, Disp, Segment)) + break; + + SDValue PassThru = Mgt->getPassThru(); + SDValue Chain = Mgt->getChain(); + SDVTList VTs = Mgt->getVTList(); + + MachineSDNode *NewNode; + if (AVX512Gather) { + SDValue Ops[] = {PassThru, Mask, Base, Scale, + Index, Disp, Segment, Chain}; + NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); + } else { + SDValue Ops[] = {PassThru, Base, Scale, Index, + Disp, Segment, Mask, Chain}; + NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); + } + CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()}); + ReplaceNode(Node, NewNode); + return; + } + case X86ISD::MSCATTER: { + auto *Sc = cast(Node); + SDValue Value = Sc->getValue(); + SDValue IndexOp = Sc->getIndex(); + MVT IndexVT = IndexOp.getSimpleValueType(); + MVT ValueVT = Value.getSimpleValueType(); + + // This is just to prevent crashes if the nodes are malformed somehow. We're + // otherwise only doing loose type checking in here based on type what + // a type constraint would say just like table based isel. + if (!ValueVT.isVector()) + break; + + unsigned NumElts = ValueVT.getVectorNumElements(); + MVT ValueSVT = ValueVT.getVectorElementType(); + + bool IsFP = ValueSVT.isFloatingPoint(); + unsigned EltSize = ValueSVT.getSizeInBits(); + + unsigned Opc; + if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr; + else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) + Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr; + else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) + Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr; + else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) + Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr; + else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) + Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr; + else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) + Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr; + else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr; + else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1519,101 +1519,14 @@
     const TargetLibraryInfo *libInfo);
 } // end namespace X86
 
-  // Base class for all X86 non-masked store operations.
-  class X86StoreSDNode : public MemSDNode {
-  public:
-    X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
-                   SDVTList VTs, EVT MemVT,
-                   MachineMemOperand *MMO)
-      :MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
-    const SDValue &getValue() const { return getOperand(1); }
-    const SDValue &getBasePtr() const { return getOperand(2); }
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VTRUNCSTORES ||
-             N->getOpcode() == X86ISD::VTRUNCSTOREUS;
-    }
-  };
-
-  // Base class for all X86 masked store operations.
-  // The class has the same order of operands as MaskedStoreSDNode for
-  // convenience.
-  class X86MaskedStoreSDNode : public MemSDNode {
-  public:
-    X86MaskedStoreSDNode(unsigned Opcode, unsigned Order,
-                         const DebugLoc &dl, SDVTList VTs, EVT MemVT,
-                         MachineMemOperand *MMO)
-      : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
-
-    const SDValue &getValue() const { return getOperand(1); }
-    const SDValue &getBasePtr() const { return getOperand(2); }
-    const SDValue &getMask() const { return getOperand(3); }
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
-             N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
-    }
-  };
-
-  // X86 Truncating Store with Signed saturation.
-  class TruncSStoreSDNode : public X86StoreSDNode {
-  public:
-    TruncSStoreSDNode(unsigned Order, const DebugLoc &dl,
-                      SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
-      : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VTRUNCSTORES;
-    }
-  };
-
-  // X86 Truncating Store with Unsigned saturation.
-  class TruncUSStoreSDNode : public X86StoreSDNode {
-  public:
-    TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl,
-                       SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
-      : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VTRUNCSTOREUS;
-    }
-  };
-
-  // X86 Truncating Masked Store with Signed saturation.
-  class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
-  public:
-    MaskedTruncSStoreSDNode(unsigned Order,
-                            const DebugLoc &dl, SDVTList VTs, EVT MemVT,
-                            MachineMemOperand *MMO)
-      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VMTRUNCSTORES;
-    }
-  };
-
-  // X86 Truncating Masked Store with Unsigned saturation.
-  class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
-  public:
-    MaskedTruncUSStoreSDNode(unsigned Order,
-                             const DebugLoc &dl, SDVTList VTs, EVT MemVT,
-                             MachineMemOperand *MMO)
-      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
-    }
-  };
-
   // X86 specific Gather/Scatter nodes.
   // The class has the same order of operands as MaskedGatherScatterSDNode for
   // convenience.
-  class X86MaskedGatherScatterSDNode : public MemSDNode {
+  class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
   public:
-    X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order,
-                                 const DebugLoc &dl, SDVTList VTs, EVT MemVT,
-                                 MachineMemOperand *MMO)
-      : MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {}
+    // This is intended as a utility and should never be directly created.
+    X86MaskedGatherScatterSDNode() = delete;
+    ~X86MaskedGatherScatterSDNode() = delete;
 
     const SDValue &getBasePtr() const { return getOperand(3); }
     const SDValue &getIndex() const { return getOperand(4); }
@@ -1628,11 +1541,6 @@
   class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
   public:
-    X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
-                          EVT MemVT, MachineMemOperand *MMO)
-        : X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT,
-                                       MMO) {}
-
     const SDValue &getPassThru() const { return getOperand(1); }
 
     static bool classof(const SDNode *N) {
@@ -1642,11 +1550,6 @@
   class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
   public:
-    X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
-                          EVT MemVT, MachineMemOperand *MMO)
-        : X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT,
-                                       MMO) {}
-
     const SDValue &getValue() const { return getOperand(1); }
 
     static bool classof(const SDNode *N) {
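With the constructors deleted, X86MaskedGatherScatterSDNode and its subclasses become pure accessor "views" over nodes that are now allocated as plain MemIntrinsicSDNodes. A minimal standalone sketch of that pattern (invented names; it is tolerable only because the view adds no data members or virtuals, which is the same property LLVM's cast<> relies on here):

#include <cassert>

struct Node {
  int Opcode;
  int Operands[8];
};

// A non-instantiable view type: objects are only ever created as Node, and
// this class just layers typed accessors on top, guarded by a kind check.
struct GatherNodeView : Node {
  GatherNodeView() = delete;
  ~GatherNodeView() = delete;
  int getIndex() const { return Operands[4]; }
  static bool classof(const Node *N) { return N->Opcode == 1; }
};

int main() {
  Node N{1, {0, 1, 2, 3, 42, 5, 6, 7}};
  assert(GatherNodeView::classof(&N));
  // Formally a downcast to a type the object never had; it works in practice
  // because the view is layout-identical to its base.
  assert(static_cast<const GatherNodeView *>(&N)->getIndex() == 42);
}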
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8284,6 +8284,10 @@
   MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
                                     : MVT::getIntegerVT(LoadSizeInBits);
   MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
+  // Allow v4f32 on SSE1 only targets.
+  // FIXME: Add more isel patterns so we can just use VT directly.
+  if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
+    VecVT = MVT::v4f32;
   if (TLI.isTypeLegal(VecVT)) {
     SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
@@ -24756,11 +24760,15 @@
   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
 
+  // Cast mask to an integer type.
+  Mask = DAG.getBitcast(MaskVT, Mask);
+
   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
-  SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
-      VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+  SDValue Res =
+      DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
+                              MemIntr->getMemoryVT(), MemIntr->getMemOperand());
   return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
 }
@@ -24796,8 +24804,9 @@
   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
-  SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
-      VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+  SDValue Res =
+      DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
+                              MemIntr->getMemoryVT(), MemIntr->getMemOperand());
   return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
 }
@@ -24826,8 +24835,9 @@
   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
-  SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
-      VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+  SDValue Res =
+      DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+                              MemIntr->getMemoryVT(), MemIntr->getMemOperand());
   return Res.getValue(1);
 }
@@ -24987,13 +24997,11 @@
 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
                 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
                 SelectionDAG &DAG) {
   SDVTList VTs = DAG.getVTList(MVT::Other);
   SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
   SDValue Ops[] = { Chain, Val, Ptr, Undef };
-  return SignedSat ?
-    DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
-    DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+  unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
+  return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
 }
 
 /// Emit Masked Truncating Store with signed or unsigned saturation.
@@ -25001,12 +25009,10 @@
 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
                       SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
                       MachineMemOperand *MMO, SelectionDAG &DAG) {
   SDVTList VTs = DAG.getVTList(MVT::Other);
   SDValue Ops[] = { Chain, Val, Ptr, Mask };
-  return SignedSat ?
-    DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
-    DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+  unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
+  return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
 }
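// What the two opcode pairs above encode (standalone sketch, not LLVM API):
// VTRUNCSTORES truncates with signed saturation, VTRUNCSTOREUS with unsigned
// saturation. A scalar model for an i32 value stored to an i8 memory element:
#include <algorithm>
#include <cstdint>

std::int8_t truncStoreSatSigned(std::int32_t V) {
  return static_cast<std::int8_t>(std::clamp(V, -128, 127));
}

std::uint8_t truncStoreSatUnsigned(std::int32_t V) {
  return static_cast<std::uint8_t>(std::clamp(V, 0, 255));
}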
 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
@@ -27514,15 +27520,14 @@
   return false;
 }
 
-// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
-// TODO: In 32-bit mode, use FISTP when X87 is available?
 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   Type *MemType = SI->getValueOperand()->getType();
 
   bool NoImplicitFloatOps =
       SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
+      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+      (Subtarget.hasSSE1() || Subtarget.hasX87()))
     return false;
 
   return needsCmpXchgNb(MemType);
@@ -27541,7 +27546,7 @@
       LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE2() || Subtarget.hasX87()))
+      (Subtarget.hasSSE1() || Subtarget.hasX87()))
     return AtomicExpansionKind::None;
 
   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
@@ -28281,28 +28286,54 @@
     return Op;
 
   if (VT == MVT::i64 && !IsTypeLegal) {
-    // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
-    // FIXME: Use movlps with SSE1.
-    // FIXME: Use fist with X87.
+    // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
+    // is enabled.
     bool NoImplicitFloatOps =
         DAG.getMachineFunction().getFunction().hasFnAttribute(
            Attribute::NoImplicitFloat);
-    if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-        Subtarget.hasSSE2()) {
-      SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
-                                     Node->getOperand(2));
-      SDVTList Tys = DAG.getVTList(MVT::Other);
-      SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
-      SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
-                                              Ops, MVT::i64,
-                                              Node->getMemOperand());
+    if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
+      SDValue Chain;
+      if (Subtarget.hasSSE1()) {
+        SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                                       Node->getOperand(2));
+        MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+        SclToVec = DAG.getBitcast(StVT, SclToVec);
+        SDVTList Tys = DAG.getVTList(MVT::Other);
+        SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
+        Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
+                                        MVT::i64, Node->getMemOperand());
+      } else if (Subtarget.hasX87()) {
+        // First load this into an 80-bit X87 register using a stack temporary.
+        // This will put the whole integer into the significand.
+        SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
+        int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+        MachinePointerInfo MPI =
+            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+        Chain =
+            DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
+                         MPI, /*Align*/ 0, MachineMemOperand::MOStore);
+        SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+        SDValue LdOps[] = {Chain, StackPtr};
+        SDValue Value =
+            DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
+                                    /*Align*/ 0, MachineMemOperand::MOLoad);
+        Chain = Value.getValue(1);
+
+        // Now use an FIST to do the atomic store.
+        SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
+        Chain =
+            DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
                                     StoreOps, MVT::i64, Node->getMemOperand());
+      }
 
-      // If this is a sequentially consistent store, also emit an appropriate
-      // barrier.
-      if (IsSeqCst)
-        Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+      if (Chain) {
+        // If this is a sequentially consistent store, also emit an appropriate
+        // barrier.
+        if (IsSeqCst)
+          Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
 
-      return Chain;
+        return Chain;
+      }
     }
   }
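// Why the FILD/FIST sequence above is lossless (standalone check, not part of
// the patch): the x87 80-bit format has a 64-bit significand, so an i64
// round-trips through it exactly, and fild m64 / fistp m64 each touch memory
// with a single 64-bit access. Assumes a target where long double is the
// 80-bit x87 format, e.g. x86 GCC/Clang; the guard skips the check elsewhere.
#include <cassert>
#include <cstdint>
#include <limits>

std::int64_t roundTripThroughX87(std::int64_t V) {
  long double T = static_cast<long double>(V); // models fild
  return static_cast<std::int64_t>(T);         // models fistp
}

int main() {
  if (std::numeric_limits<long double>::digits >= 64) {
    assert(roundTripThroughX87(INT64_MAX) == INT64_MAX);
    assert(roundTripThroughX87(INT64_MIN) == INT64_MIN);
    assert(roundTripThroughX87(-1) == -1);
  }
}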
@@ -28476,8 +28507,8 @@
     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
     SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
     SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
-    SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
-        VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
+    SDValue NewScatter = DAG.getMemIntrinsicNode(
+        X86ISD::MSCATTER, dl, VTs, Ops, N->getMemoryVT(), N->getMemOperand());
     return SDValue(NewScatter.getNode(), 1);
   }
   return SDValue();
@@ -28511,8 +28542,8 @@
 
   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
-  SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
-      VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
+  SDValue NewScatter = DAG.getMemIntrinsicNode(
+      X86ISD::MSCATTER, dl, VTs, Ops, N->getMemoryVT(), N->getMemOperand());
   return SDValue(NewScatter.getNode(), 1);
 }
@@ -28667,9 +28698,9 @@
   SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
                     N->getScale() };
-  SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
-      DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
-      N->getMemOperand());
+  SDValue NewGather = DAG.getMemIntrinsicNode(
+      X86ISD::MGATHER, dl, DAG.getVTList(VT, MaskVT, MVT::Other), Ops,
+      N->getMemoryVT(), N->getMemOperand());
   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
                                 DAG.getIntPtrConstant(0, dl));
   return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
@@ -29638,15 +29669,27 @@
         Attribute::NoImplicitFloat);
     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
       auto *Node = cast<AtomicSDNode>(N);
-      if (Subtarget.hasSSE2()) {
-        // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
-        // lower 64-bits.
-        SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+      if (Subtarget.hasSSE1()) {
+        // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
+        // Then extract the lower 64-bits.
+        MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+        SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
         SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
                                              MVT::i64, Node->getMemOperand());
-        SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+        if (Subtarget.hasSSE2()) {
+          SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+                                    DAG.getIntPtrConstant(0, dl));
+          Results.push_back(Res);
+          Results.push_back(Ld.getValue(1));
+          return;
+        }
+        // We use an alternative sequence for SSE1 that extracts as v2f32 and
+        // then casts to i64. This avoids a 128-bit stack temporary being
+        // created by type legalization if we were to cast v4f32->v2i64.
+        SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
                                   DAG.getIntPtrConstant(0, dl));
+        Res = DAG.getBitcast(MVT::i64, Res);
         Results.push_back(Res);
         Results.push_back(Ld.getValue(1));
         return;
@@ -29771,9 +29814,10 @@
     }
     SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
                       Gather->getBasePtr(), Index, Gather->getScale() };
-    SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
-        DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl,
-        Gather->getMemoryVT(), Gather->getMemOperand());
+    SDValue Res = DAG.getMemIntrinsicNode(
+        X86ISD::MGATHER, dl,
+        DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops,
+        Gather->getMemoryVT(), Gather->getMemOperand());
     Results.push_back(Res);
     Results.push_back(Res.getValue(2));
     return;
@@ -42121,8 +42165,10 @@
   SDValue Mask = Mst->getMask();
   if (Mask.getScalarValueSizeInBits() != 1) {
     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
-    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI))
+    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
+      DCI.AddToWorklist(N);
       return SDValue(N, 0);
+    }
     if (SDValue NewMask =
             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
       return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
@@ -42370,6 +42416,30 @@
   return SDValue();
 }
 
+static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const X86Subtarget &Subtarget) {
+  auto *St = cast<MemIntrinsicSDNode>(N);
+
+  SDValue StoredVal = N->getOperand(1);
+  MVT VT = StoredVal.getSimpleValueType();
+  EVT MemVT = St->getMemoryVT();
+
+  // Figure out which elements we demand.
+  unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
+  APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
+
+  APInt KnownUndef, KnownZero;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
+                                     KnownZero, DCI)) {
+    DCI.AddToWorklist(N);
+    return SDValue(N, 0);
+  }
+
+  return SDValue();
+}
+
 /// Return 'true' if this vector operation is "horizontal"
 /// and return the operands for the horizontal operation in LHS and RHS. A
 /// horizontal operation performs the binary operation on successive elements
@@ -43711,8 +43781,10 @@
   // BT ignores high bits in the bit index operand.
   unsigned BitWidth = N1.getValueSizeInBits();
   APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
-  if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI))
+  if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
+    DCI.AddToWorklist(N);
     return SDValue(N, 0);
+  }
 
   return SDValue();
 }
@@ -43726,8 +43798,30 @@
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   APInt DemandedElts = APInt::getLowBitsSet(8, 4);
   if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
-                                     DCI))
+                                     DCI)) {
+    DCI.AddToWorklist(N);
     return SDValue(N, 0);
+  }
+
+  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+    // Unless the load is volatile or atomic.
+ if (LN->isSimple()) { + SDLoc dl(N); + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue VZLoad = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, + LN->getPointerInfo(), + LN->getAlignment(), + LN->getMemOperand()->getFlags()); + SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32, + DAG.getBitcast(MVT::v8i16, VZLoad)); + DCI.CombineTo(N, Convert); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + return SDValue(N, 0); + } + } } return SDValue(); @@ -44608,8 +44702,10 @@ if (Mask.getScalarValueSizeInBits() != 1) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { + DCI.AddToWorklist(N); return SDValue(N, 0); + } } return SDValue(); @@ -44698,8 +44794,10 @@ if (Mask.getScalarValueSizeInBits() != 1) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { + DCI.AddToWorklist(N); return SDValue(N, 0); + } } return SDValue(); @@ -46731,6 +46829,8 @@ case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); + case X86ISD::VEXTRACT_STORE: + return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::STRICT_SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -8566,7 +8566,7 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in multiclass avx512_cvtph2ps { defm rr : AVX512_maskable_split<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), "vcvtph2ps", "$src", "$src", @@ -8575,8 +8575,8 @@ T8PD, Sched<[sched]>; defm rm : AVX512_maskable_split<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), "vcvtph2ps", "$src", "$src", - (X86any_cvtph2ps (_src.VT (ld_frag addr:$src))), - (X86cvtph2ps (_src.VT (ld_frag addr:$src)))>, + (X86any_cvtph2ps (_src.VT ld_dag)), + (X86cvtph2ps (_src.VT ld_dag))>, T8PD, Sched<[sched.Folded]>; } @@ -8591,22 +8591,21 @@ } let Predicates = [HasAVX512] in - defm VCVTPH2PSZ : avx512_cvtph2ps, + defm VCVTPH2PSZ : avx512_cvtph2ps, avx512_cvtph2ps_sae, EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; let Predicates = [HasVLX] in { defm VCVTPH2PSZ256 : avx512_cvtph2ps, EVEX, EVEX_V256, + (load addr:$src), WriteCvtPH2PSY>, EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; defm VCVTPH2PSZ128 : avx512_cvtph2ps, EVEX, EVEX_V128, + (bitconvert (v2i64 (X86vzload64 addr:$src))), + WriteCvtPH2PS>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; // Pattern match vcvtph2ps of a scalar i64 load. - def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), - (VCVTPH2PSZ128rm addr:$src)>; def : Pat<(v4f32 (X86any_cvtph2ps (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), (VCVTPH2PSZ128rm addr:$src)>; @@ -9727,54 +9726,49 @@ // FIXME: Improve scheduling of gather/scatter instructions. 
multiclass avx512_gather opc, string OpcodeStr, X86VectorVTInfo _, - X86MemOperand memop, PatFrag GatherNode, - RegisterClass MaskRC = _.KRCWM> { + X86MemOperand memop, RegisterClass MaskRC = _.KRCWM> { let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb", - ExeDomain = _.ExeDomain in + ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in def rm : AVX5128I, EVEX, EVEX_K, - EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>; + []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>; } multiclass avx512_gather_q_pd dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { defm NAME##D##SUFF##Z: avx512_gather, EVEX_V512, VEX_W; + vy512xmem>, EVEX_V512, VEX_W; defm NAME##Q##SUFF##Z: avx512_gather, EVEX_V512, VEX_W; + vz512mem>, EVEX_V512, VEX_W; let Predicates = [HasVLX] in { defm NAME##D##SUFF##Z256: avx512_gather, EVEX_V256, VEX_W; + vx256xmem>, EVEX_V256, VEX_W; defm NAME##Q##SUFF##Z256: avx512_gather, EVEX_V256, VEX_W; + vy256xmem>, EVEX_V256, VEX_W; defm NAME##D##SUFF##Z128: avx512_gather, EVEX_V128, VEX_W; + vx128xmem>, EVEX_V128, VEX_W; defm NAME##Q##SUFF##Z128: avx512_gather, EVEX_V128, VEX_W; + vx128xmem>, EVEX_V128, VEX_W; } } multiclass avx512_gather_d_ps dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME##D##SUFF##Z: avx512_gather, EVEX_V512; - defm NAME##Q##SUFF##Z: avx512_gather, EVEX_V512; + defm NAME##D##SUFF##Z: avx512_gather, + EVEX_V512; + defm NAME##Q##SUFF##Z: avx512_gather, + EVEX_V512; let Predicates = [HasVLX] in { defm NAME##D##SUFF##Z256: avx512_gather, EVEX_V256; + vy256xmem>, EVEX_V256; defm NAME##Q##SUFF##Z256: avx512_gather, EVEX_V256; + vy128xmem>, EVEX_V256; defm NAME##D##SUFF##Z128: avx512_gather, EVEX_V128; + vx128xmem>, EVEX_V128; defm NAME##Q##SUFF##Z128: avx512_gather, - EVEX_V128; + vx64xmem, VK2WM>, EVEX_V128; } } @@ -9786,55 +9780,52 @@ avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">; multiclass avx512_scatter opc, string OpcodeStr, X86VectorVTInfo _, - X86MemOperand memop, PatFrag ScatterNode, - RegisterClass MaskRC = _.KRCWM> { + X86MemOperand memop, RegisterClass MaskRC = _.KRCWM> { -let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in +let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain, + hasSideEffects = 0 in def mr : AVX5128I, - EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, + []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteStore]>; } multiclass avx512_scatter_q_pd dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { defm NAME##D##SUFF##Z: avx512_scatter, EVEX_V512, VEX_W; + vy512xmem>, EVEX_V512, VEX_W; defm NAME##Q##SUFF##Z: avx512_scatter, EVEX_V512, VEX_W; + vz512mem>, EVEX_V512, VEX_W; let Predicates = [HasVLX] in { defm NAME##D##SUFF##Z256: avx512_scatter, EVEX_V256, VEX_W; + vx256xmem>, EVEX_V256, VEX_W; defm NAME##Q##SUFF##Z256: avx512_scatter, EVEX_V256, VEX_W; + vy256xmem>, EVEX_V256, VEX_W; defm NAME##D##SUFF##Z128: avx512_scatter, EVEX_V128, VEX_W; + vx128xmem>, EVEX_V128, VEX_W; defm NAME##Q##SUFF##Z128: avx512_scatter, EVEX_V128, VEX_W; + vx128xmem>, EVEX_V128, VEX_W; } } multiclass avx512_scatter_d_ps dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME##D##SUFF##Z: avx512_scatter, EVEX_V512; - defm NAME##Q##SUFF##Z: avx512_scatter, EVEX_V512; + defm NAME##D##SUFF##Z: avx512_scatter, + EVEX_V512; + defm NAME##Q##SUFF##Z: avx512_scatter, + EVEX_V512; let Predicates = [HasVLX] in { defm NAME##D##SUFF##Z256: 
avx512_scatter, EVEX_V256; + vy256xmem>, EVEX_V256; defm NAME##Q##SUFF##Z256: avx512_scatter, EVEX_V256; + vy128xmem>, EVEX_V256; defm NAME##D##SUFF##Z128: avx512_scatter, EVEX_V128; + vx128xmem>, EVEX_V128; defm NAME##Q##SUFF##Z128: avx512_scatter, - EVEX_V128; + vx64xmem, VK2WM>, EVEX_V128; } } diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -915,89 +915,6 @@ def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>; def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>; -def X86masked_gather : SDNode<"X86ISD::MGATHER", - SDTypeProfile<2, 3, [SDTCisVec<0>, - SDTCisVec<1>, SDTCisInt<1>, - SDTCisSameAs<0, 2>, - SDTCisSameAs<1, 3>, - SDTCisPtrTy<4>]>, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; - -def X86masked_scatter : SDNode<"X86ISD::MSCATTER", - SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisSameAs<0, 2>, - SDTCVecEltisVT<0, i1>, - SDTCisPtrTy<3>]>, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; - -def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast(N); - return Mgt->getIndex().getValueType() == MVT::v4i32; -}]>; - -def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast(N); - return Mgt->getIndex().getValueType() == MVT::v8i32; -}]>; - -def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast(N); - return Mgt->getIndex().getValueType() == MVT::v2i64; -}]>; -def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast(N); - return Mgt->getIndex().getValueType() == MVT::v4i64; -}]>; -def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast(N); - return Mgt->getIndex().getValueType() == MVT::v8i64; -}]>; -def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast(N); - return Mgt->getIndex().getValueType() == MVT::v16i32; -}]>; - -def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ - X86MaskedScatterSDNode *Sc = cast(N); - return Sc->getIndex().getValueType() == MVT::v2i64; -}]>; - -def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ - X86MaskedScatterSDNode *Sc = cast(N); - return Sc->getIndex().getValueType() == MVT::v4i32; -}]>; - -def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ - X86MaskedScatterSDNode *Sc = cast(N); - return Sc->getIndex().getValueType() == MVT::v4i64; -}]>; - -def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ - X86MaskedScatterSDNode *Sc = cast(N); - return Sc->getIndex().getValueType() == MVT::v8i32; -}]>; - -def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_scatter 
                              node:$src1, node:$src2, node:$src3) , [{
-  X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
-  return Sc->getIndex().getValueType() == MVT::v8i64;
-}]>;
-def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                  (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
-  X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
-  return Sc->getIndex().getValueType() == MVT::v16i32;
-}]>;
-
 // 128-bit bitconvert pattern fragments
 def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
 def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
@@ -1205,60 +1122,60 @@
 
 def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr),
                                (X86TruncSStore node:$val, node:$ptr), [{
-  return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
 }]>;
 
 def truncstore_us_vi8 : PatFrag<(ops node:$val, node:$ptr),
                                 (X86TruncUSStore node:$val, node:$ptr), [{
-  return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
 }]>;
 
 def truncstore_s_vi16 : PatFrag<(ops node:$val, node:$ptr),
                                 (X86TruncSStore node:$val, node:$ptr), [{
-  return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
 }]>;
 
 def truncstore_us_vi16 : PatFrag<(ops node:$val, node:$ptr),
                                  (X86TruncUSStore node:$val, node:$ptr), [{
-  return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
 }]>;
 
 def truncstore_s_vi32 : PatFrag<(ops node:$val, node:$ptr),
                                 (X86TruncSStore node:$val, node:$ptr), [{
-  return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
 }]>;
 
 def truncstore_us_vi32 : PatFrag<(ops node:$val, node:$ptr),
                                  (X86TruncUSStore node:$val, node:$ptr), [{
-  return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
 }]>;
 
 def masked_truncstore_s_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                                       (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
-  return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
 }]>;
 
 def masked_truncstore_us_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                                        (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
-  return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
 }]>;
 
 def masked_truncstore_s_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                                        (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
-  return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
 }]>;
 
 def masked_truncstore_us_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                                         (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
-  return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
 }]>;
 
 def masked_truncstore_s_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                                        (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
-  return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
 }]>;
 
 def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                                         (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
-  return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+  return cast<MemSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
 }]>;
diff
--git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -7339,10 +7339,10 @@ "vcvtph2ps\t{$src, $dst|$dst, $src}", [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>, T8PD, VEX, Sched<[sched]>; + let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - [(set RC:$dst, (X86any_cvtph2ps (loadv8i16 addr:$src)))]>, - T8PD, VEX, Sched<[sched.Folded]>; + []>, T8PD, VEX, Sched<[sched.Folded]>; } multiclass f16c_ps2ph; + def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))), + (VCVTPH2PSYrm addr:$src)>; def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))), @@ -7931,57 +7933,48 @@ // FIXME: Improve scheduling of gather instructions. multiclass avx2_gather opc, string OpcodeStr, ValueType VTx, - ValueType VTy, PatFrag GatherNode128, - PatFrag GatherNode256, RegisterClass RC256, + ValueType VTy, RegisterClass RC256, X86MemOperand memop128, X86MemOperand memop256, ValueType MTx = VTx, ValueType MTy = VTy> { +let mayLoad = 1, hasSideEffects = 0 in { def rm : AVX28I, - VEX, Sched<[WriteLoad]>; + []>, VEX, Sched<[WriteLoad]>; def Yrm : AVX28I, - VEX, VEX_L, Sched<[WriteLoad]>; + []>, VEX, VEX_L, Sched<[WriteLoad]>; +} } let Predicates = [HasAVX2] in { let mayLoad = 1, hasSideEffects = 0, Constraints = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in { - defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32, - mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W; - defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64, - mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W; - defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32, - mgatherv8i32, VR256, vx128mem, vy256mem>; - defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64, - mgatherv4i64, VR128, vx64mem, vy128mem>; + defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, + VR256, vx128mem, vx256mem>, VEX_W; + defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, + VR256, vx128mem, vy256mem>, VEX_W; + defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, + VR256, vx128mem, vy256mem>; + defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, + VR128, vx64mem, vy128mem>; let ExeDomain = SSEPackedDouble in { - defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32, - mgatherv4i32, VR256, vx128mem, vx256mem, - v2i64, v4i64>, VEX_W; - defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64, - mgatherv4i64, VR256, vx128mem, vy256mem, - v2i64, v4i64>, VEX_W; + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, + VR256, vx128mem, vx256mem, v2i64, v4i64>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, + VR256, vx128mem, vy256mem, v2i64, v4i64>, VEX_W; } let ExeDomain = SSEPackedSingle in { - defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32, - mgatherv8i32, VR256, vx128mem, vy256mem, - v4i32, v8i32>; - defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64, - mgatherv4i64, VR128, vx64mem, vy128mem, - v4i32, v4i32>; + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, + VR256, vx128mem, vy256mem, v4i32, v8i32>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, + VR128, vx64mem, vy128mem, v4i32, v4i32>; } } } diff --git 
a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5568,8 +5568,11 @@
       isa<IntegerType>(A->getType())) {
     Value *Result;
     Constant *Overflow;
-    if (OptimizeOverflowCheck(Instruction::Add, /*Signed*/false, A, B,
-                              *AddI, Result, Overflow)) {
+    // m_UAddWithOverflow can match patterns that do not include an explicit
+    // "add" instruction, so check the opcode of the matched op.
+    if (AddI->getOpcode() == Instruction::Add &&
+        OptimizeOverflowCheck(Instruction::Add, /*Signed*/ false, A, B, *AddI,
+                              Result, Overflow)) {
       replaceInstUsesWith(*AddI, Result);
       return replaceInstUsesWith(I, Overflow);
     }
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -115,7 +115,8 @@
   // list.
   Function *
   insertCounterWriteout(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
-  Function *insertFlush(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+  Function *insertReset(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+  Function *insertFlush(Function *ResetF);
 
   void AddFlushBeforeForkAndExec();
@@ -631,35 +632,74 @@
 }
 
 void GCOVProfiler::AddFlushBeforeForkAndExec() {
-  SmallVector<Instruction *, 2> ForkAndExecs;
+  SmallVector<std::pair<bool, CallInst *>, 2> ForkAndExecs;
   for (auto &F : M->functions()) {
     auto *TLI = &GetTLI(F);
     for (auto &I : instructions(F)) {
       if (CallInst *CI = dyn_cast<CallInst>(&I)) {
         if (Function *Callee = CI->getCalledFunction()) {
           LibFunc LF;
-          if (TLI->getLibFunc(*Callee, LF) &&
-              (LF == LibFunc_fork || LF == LibFunc_execl ||
-               LF == LibFunc_execle || LF == LibFunc_execlp ||
-               LF == LibFunc_execv || LF == LibFunc_execvp ||
-               LF == LibFunc_execve || LF == LibFunc_execvpe ||
-               LF == LibFunc_execvP)) {
-            ForkAndExecs.push_back(&I);
+          if (TLI->getLibFunc(*Callee, LF)) {
+            if (LF == LibFunc_fork) {
+#if !defined(_WIN32)
+              ForkAndExecs.push_back({true, CI});
+#endif
+            } else if (LF == LibFunc_execl || LF == LibFunc_execle ||
+                       LF == LibFunc_execlp || LF == LibFunc_execv ||
+                       LF == LibFunc_execvp || LF == LibFunc_execve ||
+                       LF == LibFunc_execvpe || LF == LibFunc_execvP) {
+              ForkAndExecs.push_back({false, CI});
+            }
           }
         }
       }
     }
  }
 
-  // We need to split the block after the fork/exec call
-  // because else the counters for the lines after will be
-  // the same as before the call.
-  for (auto I : ForkAndExecs) {
-    IRBuilder<> Builder(I);
-    FunctionType *FTy = FunctionType::get(Builder.getVoidTy(), {}, false);
-    FunctionCallee GCOVFlush = M->getOrInsertFunction("__gcov_flush", FTy);
-    Builder.CreateCall(GCOVFlush);
-    I->getParent()->splitBasicBlock(I);
+  for (auto F : ForkAndExecs) {
+    IRBuilder<> Builder(F.second);
+    BasicBlock *Parent = F.second->getParent();
+    auto NextInst = ++F.second->getIterator();
+
+    if (F.first) {
+      // We have a fork, so just reset the counters in the child process.
+      FunctionType *FTy = FunctionType::get(Builder.getInt32Ty(), {}, false);
+      FunctionCallee GCOVFork = M->getOrInsertFunction("__gcov_fork", FTy);
+      F.second->setCalledFunction(GCOVFork);
+      if (NextInst != Parent->end()) {
+        // We split just after the fork to have a counter for the lines after
+        // it. There is still a bug here:
+        //   void foo() { fork(); }
+        //   void bar() { foo(); blah(); }
+        // "blah();" will be executed twice but shown as executed once,
+        // because it belongs to the same block as "foo();".
+        Parent->splitBasicBlock(NextInst);
+
+        // back() is a br instruction with a debug location equal to the one
+        // from NextInst, so to avoid having two debug locs on two blocks,
+        // just change it.
+        DebugLoc Loc = F.second->getDebugLoc();
+        Parent->back().setDebugLoc(Loc);
+      }
+    } else {
+      // Since the process is replaced by a new one, we need to write out the
+      // gcda files. No need to reset the counters since they'll be lost
+      // after the exec**.
+      FunctionType *FTy = FunctionType::get(Builder.getVoidTy(), {}, false);
+      FunctionCallee WriteoutF =
+          M->getOrInsertFunction("llvm_writeout_files", FTy);
+      Builder.CreateCall(WriteoutF);
+      if (NextInst != Parent->end()) {
+        DebugLoc Loc = F.second->getDebugLoc();
+        Builder.SetInsertPoint(&*NextInst);
+        // If the exec** fails, we must reset the counters since they've
+        // already been dumped.
+        FunctionCallee ResetF =
+            M->getOrInsertFunction("llvm_reset_counters", FTy);
+        Builder.CreateCall(ResetF)->setDebugLoc(Loc);
+        Parent->splitBasicBlock(NextInst);
+        Parent->back().setDebugLoc(Loc);
+      }
+    }
  }
 }
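In the rewritten form above, each call site gets runtime behavior along the following lines. This is a conceptual sketch assuming a POSIX environment; __gcov_fork is the real libgcov entry point the pass redirects to, while the two stubs below merely stand in for the profiling runtime's writeout and reset hooks:

#include <unistd.h>

// Placeholders for the real profiling-runtime entry points.
extern "C" void llvm_writeout_files() { /* dumps the .gcda files */ }
extern "C" void llvm_reset_counters() { /* zeroes the in-memory counters */ }

// fork(): the child inherits a copy of the counters, so reset them there to
// avoid counting the parent's history twice; the parent keeps accumulating.
pid_t forkWithCounters() {
  pid_t P = fork();
  if (P == 0)
    llvm_reset_counters();
  return P;
}

// exec*(): the process image is about to be replaced, so dump the counters
// first; if the exec fails and execution continues, reset them so the
// already-dumped counts are not written out a second time.
int execvWithCounters(const char *Path, char *const Argv[]) {
  llvm_writeout_files();
  int RC = execv(Path, Argv); // returns only on failure
  llvm_reset_counters();
  return RC;
}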
@@ -851,7 +891,8 @@
   }
 
   Function *WriteoutF = insertCounterWriteout(CountersBySP);
-  Function *FlushF = insertFlush(CountersBySP);
+  Function *ResetF = insertReset(CountersBySP);
+  Function *FlushF = insertFlush(ResetF);
 
   // Create a small bit of code that registers the "__llvm_gcov_writeout" to
   // be executed at exit and the "__llvm_gcov_flush" function to be executed
@@ -869,16 +910,14 @@
   IRBuilder<> Builder(BB);
 
   FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
-  Type *Params[] = {
-    PointerType::get(FTy, 0),
-    PointerType::get(FTy, 0)
-  };
+  Type *Params[] = {PointerType::get(FTy, 0), PointerType::get(FTy, 0),
+                    PointerType::get(FTy, 0)};
   FTy = FunctionType::get(Builder.getVoidTy(), Params, false);
 
-  // Initialize the environment and register the local writeout and flush
-  // functions.
+  // Initialize the environment and register the local writeout, flush and
+  // reset functions.
   FunctionCallee GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
-  Builder.CreateCall(GCOVInit, {WriteoutF, FlushF});
+  Builder.CreateCall(GCOVInit, {WriteoutF, FlushF, ResetF});
   Builder.CreateRetVoid();
 
   appendToGlobalCtors(*M, F, 0);
@@ -1191,8 +1230,43 @@
   return WriteoutF;
 }
 
-Function *GCOVProfiler::
-insertFlush(ArrayRef<std::pair<GlobalVariable *, MDNode *>> CountersBySP) {
+Function *GCOVProfiler::insertReset(
+    ArrayRef<std::pair<GlobalVariable *, MDNode *>> CountersBySP) {
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  Function *ResetF = M->getFunction("__llvm_gcov_reset");
+  if (!ResetF)
+    ResetF = Function::Create(FTy, GlobalValue::InternalLinkage,
+                              "__llvm_gcov_reset", M);
+  else
+    ResetF->setLinkage(GlobalValue::InternalLinkage);
+  ResetF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  ResetF->addFnAttr(Attribute::NoInline);
+  if (Options.NoRedZone)
+    ResetF->addFnAttr(Attribute::NoRedZone);
+
+  BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", ResetF);
+  IRBuilder<> Builder(Entry);
+
+  // Zero out the counters.
+  for (const auto &I : CountersBySP) {
+    GlobalVariable *GV = I.first;
+    Constant *Null = Constant::getNullValue(GV->getValueType());
+    Builder.CreateStore(Null, GV);
+  }
+
+  Type *RetTy = ResetF->getReturnType();
+  if (RetTy == Type::getVoidTy(*Ctx))
+    Builder.CreateRetVoid();
+  else if (RetTy->isIntegerTy())
+    // Used if __llvm_gcov_reset was implicitly declared.
+    Builder.CreateRet(ConstantInt::get(RetTy, 0));
+  else
+    report_fatal_error("invalid return type for __llvm_gcov_reset");
+
+  return ResetF;
+}
+
+Function *GCOVProfiler::insertFlush(Function *ResetF) {
   FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
   Function *FlushF = M->getFunction("__llvm_gcov_flush");
   if (!FlushF)
@@ -1206,20 +1280,13 @@
     FlushF->addFnAttr(Attribute::NoRedZone);
 
   BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", FlushF);
+  IRBuilder<> Builder(Entry);
 
-  // Write out the current counters.
   Function *WriteoutF = M->getFunction("__llvm_gcov_writeout");
   assert(WriteoutF && "Need to create the writeout function first!");
 
-  IRBuilder<> Builder(Entry);
-  Builder.CreateCall(WriteoutF, {});
-
-  // Zero out the counters.
-  for (const auto &I : CountersBySP) {
-    GlobalVariable *GV = I.first;
-    Constant *Null = Constant::getNullValue(GV->getValueType());
-    Builder.CreateStore(Null, GV);
-  }
+  Builder.CreateCall(WriteoutF);
+  Builder.CreateCall(ResetF);
 
   Type *RetTy = FlushF->getReturnType();
   if (RetTy == Type::getVoidTy(*Ctx))
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1455,6 +1455,10 @@
   // Keep track of blocks with throwing instructions not modeled in MemorySSA.
   SmallPtrSet ThrowingBlocks;
 
+  /// Keep track of instructions (partly) overlapping with killing MemoryDefs
+  /// per basic block.
+  DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
+
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
            PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
       : F(F), AA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI) {}
@@ -1684,6 +1688,9 @@
       Updater.removeMemoryAccess(MA);
     }
 
+    auto I = IOLs.find(DeadInst->getParent());
+    if (I != IOLs.end())
+      I->second.erase(DeadInst);
     // Remove its operands
     for (Use &O : DeadInst->operands())
      if (Instruction *OpI = dyn_cast<Instruction>(O)) {
@@ -1804,7 +1811,10 @@
       // Check if NI overwrites SI.
int64_t InstWriteOffset, DepWriteOffset; - InstOverlapIntervalsTy IOL; + auto Iter = State.IOLs.insert( + std::make_pair( + NI->getParent(), InstOverlapIntervalsTy())); + auto &IOL = Iter.first->second; OverwriteResult OR = isOverwrite(SILoc, NILoc, DL, TLI, DepWriteOffset, InstWriteOffset, NI, IOL, AA, &F); @@ -1819,6 +1829,10 @@ } } + if (EnablePartialOverwriteTracking) + for (auto &KV : State.IOLs) + MadeChange |= removePartiallyOverlappedStores(&AA, DL, KV.second); + return MadeChange; } } // end anonymous namespace diff --git a/llvm/lib/Transforms/Utils/KnowledgeRetention.cpp b/llvm/lib/Transforms/Utils/KnowledgeRetention.cpp --- a/llvm/lib/Transforms/Utils/KnowledgeRetention.cpp +++ b/llvm/lib/Transforms/Utils/KnowledgeRetention.cpp @@ -229,8 +229,8 @@ Lookup->Tag->getKey() != AttrName) return false; if (IsOn) { - if (Lookup->End - Lookup->Begin < BOIE_WasOn) - return false; + assert((Lookup->End - Lookup->Begin > BOIE_WasOn) && + "missing argument of attribute"); while (true) { if (Lookup == Assume.bundle_op_info_end() || Lookup->Tag->getKey() != AttrName) diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -201,11 +201,11 @@ define i8 @unsigned_sat_variable_i8_using_cmp_notval(i8 %x, i8 %y) { ; CHECK-LABEL: unsigned_sat_variable_i8_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: mvn w9, w1 -; CHECK-NEXT: add w10, w0, w1 -; CHECK-NEXT: cmp w8, w9, uxtb -; CHECK-NEXT: csinv w0, w10, wzr, ls +; CHECK-NEXT: and w8, w1, #0xff +; CHECK-NEXT: add w8, w8, w0, uxtb +; CHECK-NEXT: add w9, w0, w1 +; CHECK-NEXT: tst w8, #0x100 +; CHECK-NEXT: csinv w0, w9, wzr, eq ; CHECK-NEXT: ret %noty = xor i8 %y, -1 %a = add i8 %x, %y @@ -247,11 +247,11 @@ define i16 @unsigned_sat_variable_i16_using_cmp_notval(i16 %x, i16 %y) { ; CHECK-LABEL: unsigned_sat_variable_i16_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: mvn w9, w1 -; CHECK-NEXT: add w10, w0, w1 -; CHECK-NEXT: cmp w8, w9, uxth -; CHECK-NEXT: csinv w0, w10, wzr, ls +; CHECK-NEXT: and w8, w1, #0xffff +; CHECK-NEXT: add w8, w8, w0, uxth +; CHECK-NEXT: add w9, w0, w1 +; CHECK-NEXT: tst w8, #0x10000 +; CHECK-NEXT: csinv w0, w9, wzr, eq ; CHECK-NEXT: ret %noty = xor i16 %y, -1 %a = add i16 %x, %y @@ -290,10 +290,9 @@ define i32 @unsigned_sat_variable_i32_using_cmp_notval(i32 %x, i32 %y) { ; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn w8, w1 -; CHECK-NEXT: add w9, w0, w1 -; CHECK-NEXT: cmp w0, w8 -; CHECK-NEXT: csinv w0, w9, wzr, ls +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: cmn w1, w0 +; CHECK-NEXT: csinv w0, w8, wzr, lo ; CHECK-NEXT: ret %noty = xor i32 %y, -1 %a = add i32 %x, %y @@ -332,10 +331,9 @@ define i64 @unsigned_sat_variable_i64_using_cmp_notval(i64 %x, i64 %y) { ; CHECK-LABEL: unsigned_sat_variable_i64_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn x8, x1 -; CHECK-NEXT: add x9, x0, x1 -; CHECK-NEXT: cmp x0, x8 -; CHECK-NEXT: csinv x0, x9, xzr, ls +; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: cmn x1, x0 +; CHECK-NEXT: csinv x0, x8, xzr, lo ; CHECK-NEXT: ret %noty = xor i64 %y, -1 %a = add i64 %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -19,7 +19,8 @@ ; ELF: } ; ELF: SHT_NOTE -; ELF: Flags [ (0x0) +; ELF: Flags [ (0x2) +; ELF: SHF_ALLOC (0x2) ; ELF: ] ; ELF: 
SectionData ( ; ELF: 0000: 04000000 08000000 01000000 414D4400 diff --git a/llvm/test/CodeGen/SystemZ/frame-23.ll b/llvm/test/CodeGen/SystemZ/frame-23.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/frame-23.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; +; Test backchain with packed-stack, which requires soft-float. + +attributes #0 = { nounwind "backchain" "packed-stack" "use-soft-float"="true" } +define i64 @fun0(i64 %a) #0 { +; CHECK-LABEL: fun0: +; CHECK: stmg %r14, %r15, 136(%r15) +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r15, -24 +; CHECK-NEXT: stg %r1, 152(%r15) +; CHECK-NEXT: brasl %r14, foo@PLT +; CHECK-NEXT: lmg %r14, %r15, 160(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = call i64 @foo(i64 %a) + ret i64 %call +} + +declare i64 @foo(i64) diff --git a/llvm/test/CodeGen/SystemZ/frame-24.ll b/llvm/test/CodeGen/SystemZ/frame-24.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/frame-24.ll @@ -0,0 +1,72 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; +; Test saving of vararg registers and backchain with packed stack. + +%struct.__va_list_tag = type { i64, i64, i8*, i8* } +declare void @llvm.va_start(i8*) + +attributes #0 = { nounwind "packed-stack"="true" } +define void @fun0(i64 %g0, double %d0, i64 %n, ...) #0 { +; CHECK-LABEL: fun0: +; CHECK: stmg %r4, %r15, 32(%r15) +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: std %f2, 328(%r15) +; CHECK-NEXT: std %f4, 336(%r15) +; CHECK-NEXT: std %f6, 344(%r15) +; CHECK-NEXT: la %r0, 352(%r15) +; CHECK-NEXT: stg %r0, 176(%r15) +; CHECK-NEXT: la %r0, 192(%r15) +; CHECK-NEXT: stg %r0, 184(%r15) +; CHECK-NEXT: mvghi 160(%r15), 2 +; CHECK-NEXT: mvghi 168(%r15), 1 +; CHECK-NEXT: lmg %r6, %r15, 240(%r15) +; CHECK-NEXT: br %r14 +entry: + %vl = alloca [1 x %struct.__va_list_tag], align 8 + %0 = bitcast [1 x %struct.__va_list_tag]* %vl to i8* + call void @llvm.va_start(i8* nonnull %0) + ret void +} + +attributes #1 = { nounwind "packed-stack"="true" "use-soft-float"="true" } +define void @fun1(i64 %g0, double %d0, i64 %n, ...) #1 { +; CHECK-LABEL: fun1: +; CHECK: stmg %r5, %r15, 72(%r15) +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: la %r0, 192(%r15) +; CHECK-NEXT: stg %r0, 184(%r15) +; CHECK-NEXT: la %r0, 320(%r15) +; CHECK-NEXT: stg %r0, 176(%r15) +; CHECK-NEXT: mvghi 168(%r15), 0 +; CHECK-NEXT: mvghi 160(%r15), 3 +; CHECK-NEXT: lmg %r6, %r15, 240(%r15) +; CHECK-NEXT: br %r14 +entry: + %vl = alloca [1 x %struct.__va_list_tag], align 8 + %0 = bitcast [1 x %struct.__va_list_tag]* %vl to i8* + call void @llvm.va_start(i8* nonnull %0) + ret void +} + +attributes #2 = { nounwind "packed-stack"="true" "use-soft-float"="true" "backchain"} +define void @fun2(i64 %g0, double %d0, i64 %n, ...) 
#2 { +; CHECK-LABEL: fun2: +; CHECK: stmg %r5, %r15, 64(%r15) +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: stg %r1, 152(%r15) +; CHECK-NEXT: la %r0, 192(%r15) +; CHECK-NEXT: stg %r0, 184(%r15) +; CHECK-NEXT: la %r0, 328(%r15) +; CHECK-NEXT: stg %r0, 176(%r15) +; CHECK-NEXT: mvghi 168(%r15), 0 +; CHECK-NEXT: mvghi 160(%r15), 3 +; CHECK-NEXT: lmg %r6, %r15, 240(%r15) +; CHECK-NEXT: br %r14 +entry: + %vl = alloca [1 x %struct.__va_list_tag], align 8 + %0 = bitcast [1 x %struct.__va_list_tag]* %vl to i8* + call void @llvm.va_start(i8* nonnull %0) + ret void +} + diff --git a/llvm/test/CodeGen/SystemZ/frameaddr-02.ll b/llvm/test/CodeGen/SystemZ/frameaddr-02.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/frameaddr-02.ll @@ -0,0 +1,54 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +; Test lowering of @llvm.frameaddress with packed-stack. + +; With back chain +attributes #0 = { nounwind "packed-stack" "backchain" "use-soft-float"="true" } +define i8* @fp0() #0 { +entry: +; CHECK-LABEL: fp0: +; CHECK: la %r2, 152(%r15) +; CHECK-NEXT: br %r14 + %0 = tail call i8* @llvm.frameaddress(i32 0) + ret i8* %0 +} + +define i8* @fp0f() #0 { +entry: +; CHECK-LABEL: fp0f: +; CHECK: lgr %r1, %r15 +; CHECK-NEXT: aghi %r15, -16 +; CHECK-NEXT: stg %r1, 152(%r15) +; CHECK-NEXT: la %r2, 168(%r15) +; CHECK-NEXT: aghi %r15, 16 +; CHECK-NEXT: br %r14 + %0 = alloca i64, align 8 + %1 = tail call i8* @llvm.frameaddress(i32 0) + ret i8* %1 +} + +; Without back chain + +attributes #1 = { nounwind "packed-stack" } +define i8* @fp1() #1 { +entry: +; CHECK-LABEL: fp1: +; CHECK: lghi %r2, 0 +; CHECK-NEXT: br %r14 + %0 = tail call i8* @llvm.frameaddress(i32 0) + ret i8* %0 +} + +define i8* @fp1f() #1 { +entry: +; CHECK-LABEL: fp1f: +; CHECK: aghi %r15, -8 +; CHECK-NEXT: lghi %r2, 0 +; CHECK-NEXT: aghi %r15, 8 +; CHECK-NEXT: br %r14 + %0 = alloca i64, align 8 + %1 = tail call i8* @llvm.frameaddress(i32 0) + ret i8* %1 +} + +declare i8* @llvm.frameaddress(i32) nounwind readnone diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll @@ -0,0 +1,229 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -disable-mve-tail-predication=false %s -o - | FileCheck %s + +define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* nocapture readonly %b, i32 %N) { +; CHECK-LABEL: sext_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: dlstp.16 lr, r2 +; CHECK: .LBB0_1: @ %vector.body +; CHECK: vldrb.s16 q0, [r1], #8 +; CHECK-NEXT: vldrh.u16 q1, [r3], #16 +; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 7 + %n.vec = and i32 %n.rnd.up, -8 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer + br 
label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer + %induction = or <8 x i32> %broadcast.splat, + %0 = getelementptr inbounds i8, i8* %b, i32 %index + %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %2 = bitcast i8* %0 to <8 x i8>* + %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) + %3 = sext <8 x i8> %wide.masked.load to <8 x i16> + %4 = getelementptr inbounds i16, i16* %a, i32 %index + %5 = bitcast i16* %4 to <8 x i16>* + %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %5, i32 2, <8 x i1> %1, <8 x i16> undef) + %6 = add <8 x i16> %wide.masked.load12, %3 + %7 = bitcast i16* %4 to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %6, <8 x i16>* %7, i32 2, <8 x i1> %1) + %index.next = add i32 %index, 8 + %8 = icmp eq i32 %index.next, %n.vec + br i1 %8, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +; Function Attrs: nofree norecurse nounwind +define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: zext_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: dlstp.16 lr, r2 +; CHECK: .LBB1_1: @ %vector.body +; CHECK: vldrb.u16 q0, [r1], #8 +; CHECK-NEXT: vldrh.u16 q1, [r3], #16 +; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: letp lr, .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 7 + %n.vec = and i32 %n.rnd.up, -8 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer + %induction = or <8 x i32> %broadcast.splat, + %0 = getelementptr inbounds i8, i8* %b, i32 %index + %1 = icmp ule <8 x i32> %induction, %broadcast.splat11 + %2 = bitcast i8* %0 to <8 x i8>* + %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) + %3 = zext <8 x i8> %wide.masked.load to <8 x i16> + %4 = getelementptr inbounds i16, i16* %a, i32 %index + %5 = bitcast i16* %4 to <8 x i16>* + %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %5, i32 2, <8 x i1> %1, <8 x i16> undef) + %6 = add <8 x i16> %wide.masked.load12, %3 + %7 = bitcast i16* %4 to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %6, <8 x i16>* %7, i32 2, <8 x i1> %1) + %index.next = add i32 %index, 8 + %8 = icmp eq i32 %index.next, 
%n.vec + br i1 %8, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +; Function Attrs: nofree norecurse nounwind +define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: sext_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK: .LBB2_1: @ %vector.body +; CHECK: vldrh.s32 q0, [r1], #8 +; CHECK-NEXT: vldrw.u32 q1, [r3], #16 +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: letp lr, .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> + %0 = getelementptr inbounds i16, i16* %b, i32 %index + %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %2 = bitcast i16* %0 to <4 x i16>* + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) + %3 = sext <4 x i16> %wide.masked.load to <4 x i32> + %4 = getelementptr inbounds i32, i32* %a, i32 %index + %5 = bitcast i32* %4 to <4 x i32>* + %wide.masked.load10 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %5, i32 4, <4 x i1> %1, <4 x i32> undef) + %6 = add nsw <4 x i32> %wide.masked.load10, %3 + %7 = bitcast i32* %4 to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %6, <4 x i32>* %7, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %8 = icmp eq i32 %index.next, %n.vec + br i1 %8, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +; Function Attrs: nofree norecurse nounwind +define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: zext_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK: .LBB3_1: @ %vector.body +; CHECK: vldrh.u32 q0, [r1], #8 +; CHECK-NEXT: vldrw.u32 q1, [r3], #16 +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: letp lr, .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc} +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 
%trip.count.minus.1, i32 0 + %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> + %0 = getelementptr inbounds i16, i16* %b, i32 %index + %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %2 = bitcast i16* %0 to <4 x i16>* + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef) + %3 = zext <4 x i16> %wide.masked.load to <4 x i32> + %4 = getelementptr inbounds i32, i32* %a, i32 %index + %5 = bitcast i32* %4 to <4 x i32>* + %wide.masked.load10 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %5, i32 4, <4 x i1> %1, <4 x i32> undef) + %6 = add <4 x i32> %wide.masked.load10, %3 + %7 = bitcast i32* %4 to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %6, <4 x i32>* %7, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %8 = icmp eq i32 %index.next, %n.vec + br i1 %8, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) + diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll --- a/llvm/test/CodeGen/X86/atomic-fp.ll +++ b/llvm/test/CodeGen/X86/atomic-fp.ll @@ -80,33 +80,25 @@ ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp -; X86-NOSSE-NEXT: pushl %ebx -; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $24, %esp -; X86-NOSSE-NEXT: movl 8(%ebp), %esi -; X86-NOSSE-NEXT: fildll (%esi) +; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: fildll (%eax) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: faddl 12(%ebp) -; X86-NOSSE-NEXT: fstpl (%esp) -; X86-NOSSE-NEXT: movl (%esp), %ebx +; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl (%esi), %eax -; X86-NOSSE-NEXT: movl 4(%esi), %edx -; X86-NOSSE-NEXT: .p2align 4, 0x90 -; X86-NOSSE-NEXT: .LBB1_1: # %atomicrmw.start -; X86-NOSSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOSSE-NEXT: lock cmpxchg8b (%esi) -; X86-NOSSE-NEXT: jne .LBB1_1 -; X86-NOSSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-NOSSE-NEXT: leal -8(%ebp), %esp -; X86-NOSSE-NEXT: popl %esi -; X86-NOSSE-NEXT: popl %ebx +; 
X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl %ecx, (%esp) +; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fildll (%esp) +; X86-NOSSE-NEXT: fistpll (%eax) +; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl ; @@ -114,33 +106,21 @@ ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: pushl %ebp ; X86-SSE1-NEXT: movl %esp, %ebp -; X86-SSE1-NEXT: pushl %ebx -; X86-SSE1-NEXT: pushl %esi ; X86-SSE1-NEXT: andl $-8, %esp -; X86-SSE1-NEXT: subl $24, %esp -; X86-SSE1-NEXT: movl 8(%ebp), %esi -; X86-SSE1-NEXT: fildll (%esi) -; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: subl $16, %esp +; X86-SSE1-NEXT: movl 8(%ebp), %eax +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: xorps %xmm1, %xmm1 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; X86-SSE1-NEXT: movss %xmm1, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: faddl 12(%ebp) -; X86-SSE1-NEXT: fstpl (%esp) -; X86-SSE1-NEXT: movl (%esp), %ebx -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl (%esi), %eax -; X86-SSE1-NEXT: movl 4(%esi), %edx -; X86-SSE1-NEXT: .p2align 4, 0x90 -; X86-SSE1-NEXT: .LBB1_1: # %atomicrmw.start -; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE1-NEXT: lock cmpxchg8b (%esi) -; X86-SSE1-NEXT: jne .LBB1_1 -; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE1-NEXT: leal -8(%ebp), %esp -; X86-SSE1-NEXT: popl %esi -; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movlps %xmm0, (%eax) +; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl ; @@ -267,7 +247,6 @@ ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp -; X86-NOSSE-NEXT: pushl %ebx ; X86-NOSSE-NEXT: andl $-8, %esp ; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: fildll glob64 @@ -278,19 +257,14 @@ ; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fld1 ; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fstpl (%esp) -; X86-NOSSE-NEXT: movl (%esp), %ebx +; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl glob64+4, %edx -; X86-NOSSE-NEXT: movl glob64, %eax -; X86-NOSSE-NEXT: .p2align 4, 0x90 -; X86-NOSSE-NEXT: .LBB3_1: # %atomicrmw.start -; X86-NOSSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOSSE-NEXT: lock cmpxchg8b glob64 -; X86-NOSSE-NEXT: jne .LBB3_1 -; X86-NOSSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-NOSSE-NEXT: leal -4(%ebp), %esp -; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fildll (%esp) +; X86-NOSSE-NEXT: fistpll glob64 +; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl ; @@ -298,30 +272,20 @@ ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: pushl %ebp ; X86-SSE1-NEXT: movl %esp, %ebp -; X86-SSE1-NEXT: pushl %ebx ; X86-SSE1-NEXT: andl $-8, %esp -; X86-SSE1-NEXT: subl $32, %esp -; X86-SSE1-NEXT: fildll glob64 -; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: 
movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: subl $16, %esp +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: xorps %xmm1, %xmm1 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; X86-SSE1-NEXT: movss %xmm1, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 -; X86-SSE1-NEXT: faddl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: fstpl (%esp) -; X86-SSE1-NEXT: movl (%esp), %ebx -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl glob64+4, %edx -; X86-SSE1-NEXT: movl glob64, %eax -; X86-SSE1-NEXT: .p2align 4, 0x90 -; X86-SSE1-NEXT: .LBB3_1: # %atomicrmw.start -; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE1-NEXT: lock cmpxchg8b glob64 -; X86-SSE1-NEXT: jne .LBB3_1 -; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE1-NEXT: leal -4(%ebp), %esp -; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: faddl (%esp) +; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movlps %xmm0, glob64 +; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl ; @@ -446,7 +410,6 @@ ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp -; X86-NOSSE-NEXT: pushl %ebx ; X86-NOSSE-NEXT: andl $-8, %esp ; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: fildll -559038737 @@ -457,19 +420,14 @@ ; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fld1 ; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fstpl (%esp) -; X86-NOSSE-NEXT: movl (%esp), %ebx +; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl -559038737, %eax -; X86-NOSSE-NEXT: movl -559038733, %edx -; X86-NOSSE-NEXT: .p2align 4, 0x90 -; X86-NOSSE-NEXT: .LBB5_1: # %atomicrmw.start -; X86-NOSSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOSSE-NEXT: lock cmpxchg8b -559038737 -; X86-NOSSE-NEXT: jne .LBB5_1 -; X86-NOSSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-NOSSE-NEXT: leal -4(%ebp), %esp -; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fildll (%esp) +; X86-NOSSE-NEXT: fistpll -559038737 +; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl ; @@ -477,30 +435,20 @@ ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: pushl %ebp ; X86-SSE1-NEXT: movl %esp, %ebp -; X86-SSE1-NEXT: pushl %ebx ; X86-SSE1-NEXT: andl $-8, %esp -; X86-SSE1-NEXT: subl $32, %esp -; X86-SSE1-NEXT: fildll -559038737 -; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: subl $16, %esp +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: xorps %xmm1, %xmm1 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; X86-SSE1-NEXT: movss %xmm1, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 -; X86-SSE1-NEXT: faddl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: fstpl (%esp) -; X86-SSE1-NEXT: movl (%esp), %ebx -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl -559038737, %eax -; X86-SSE1-NEXT: movl -559038733, %edx -; X86-SSE1-NEXT: .p2align 4, 0x90 -; X86-SSE1-NEXT: 
.LBB5_1: # %atomicrmw.start -; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE1-NEXT: lock cmpxchg8b -559038737 -; X86-SSE1-NEXT: jne .LBB5_1 -; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE1-NEXT: leal -4(%ebp), %esp -; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: faddl (%esp) +; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movlps %xmm0, -559038737 +; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl ; @@ -631,10 +579,9 @@ ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp -; X86-NOSSE-NEXT: pushl %ebx ; X86-NOSSE-NEXT: andl $-8, %esp ; X86-NOSSE-NEXT: subl $40, %esp -; X86-NOSSE-NEXT: fildll (%esp) +; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -643,18 +590,13 @@ ; X86-NOSSE-NEXT: fld1 ; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl (%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: .p2align 4, 0x90 -; X86-NOSSE-NEXT: .LBB7_1: # %atomicrmw.start -; X86-NOSSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOSSE-NEXT: lock cmpxchg8b (%esp) -; X86-NOSSE-NEXT: jne .LBB7_1 -; X86-NOSSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-NOSSE-NEXT: leal -4(%ebp), %esp -; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fildll (%esp) +; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl ; @@ -662,30 +604,20 @@ ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: pushl %ebp ; X86-SSE1-NEXT: movl %esp, %ebp -; X86-SSE1-NEXT: pushl %ebx ; X86-SSE1-NEXT: andl $-8, %esp -; X86-SSE1-NEXT: subl $40, %esp -; X86-SSE1-NEXT: fildll (%esp) -; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: subl $24, %esp +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: xorps %xmm1, %xmm1 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; X86-SSE1-NEXT: movss %xmm1, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 -; X86-SSE1-NEXT: faddl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: faddl (%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl (%esp), %eax -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE1-NEXT: .p2align 4, 0x90 -; X86-SSE1-NEXT: .LBB7_1: # %atomicrmw.start -; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE1-NEXT: lock cmpxchg8b (%esp) -; X86-SSE1-NEXT: jne .LBB7_1 -; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE1-NEXT: leal -4(%ebp), %esp -; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl ; @@ -747,36 +679,28 @@ ; X86-NOSSE: # %bb.0: # %bb ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, 
%ebp -; X86-NOSSE-NEXT: pushl %ebx -; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp -; X86-NOSSE-NEXT: movl 20(%ebp), %esi -; X86-NOSSE-NEXT: movl 8(%ebp), %edi -; X86-NOSSE-NEXT: fildll (%edi,%esi,8) +; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: movl 20(%ebp), %eax +; X86-NOSSE-NEXT: movl 8(%ebp), %ecx +; X86-NOSSE-NEXT: fildll (%ecx,%eax,8) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: faddl 12(%ebp) -; X86-NOSSE-NEXT: fstpl (%esp) -; X86-NOSSE-NEXT: movl (%esp), %ebx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl (%edi,%esi,8), %eax -; X86-NOSSE-NEXT: movl 4(%edi,%esi,8), %edx -; X86-NOSSE-NEXT: .p2align 4, 0x90 -; X86-NOSSE-NEXT: .LBB8_1: # %atomicrmw.start -; X86-NOSSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOSSE-NEXT: lock cmpxchg8b (%edi,%esi,8) -; X86-NOSSE-NEXT: jne .LBB8_1 -; X86-NOSSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-NOSSE-NEXT: leal -12(%ebp), %esp +; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: movl %edx, (%esp) +; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fildll (%esp) +; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) +; X86-NOSSE-NEXT: leal -4(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi -; X86-NOSSE-NEXT: popl %edi -; X86-NOSSE-NEXT: popl %ebx ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl ; @@ -784,36 +708,22 @@ ; X86-SSE1: # %bb.0: # %bb ; X86-SSE1-NEXT: pushl %ebp ; X86-SSE1-NEXT: movl %esp, %ebp -; X86-SSE1-NEXT: pushl %ebx -; X86-SSE1-NEXT: pushl %edi -; X86-SSE1-NEXT: pushl %esi ; X86-SSE1-NEXT: andl $-8, %esp -; X86-SSE1-NEXT: subl $32, %esp -; X86-SSE1-NEXT: movl 20(%ebp), %esi -; X86-SSE1-NEXT: movl 8(%ebp), %edi -; X86-SSE1-NEXT: fildll (%edi,%esi,8) -; X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: subl $16, %esp +; X86-SSE1-NEXT: movl 20(%ebp), %eax +; X86-SSE1-NEXT: movl 8(%ebp), %ecx +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: xorps %xmm1, %xmm1 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; X86-SSE1-NEXT: movss %xmm1, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: faddl 12(%ebp) -; X86-SSE1-NEXT: fstpl (%esp) -; X86-SSE1-NEXT: movl (%esp), %ebx -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl (%edi,%esi,8), %eax -; X86-SSE1-NEXT: movl 4(%edi,%esi,8), %edx -; X86-SSE1-NEXT: .p2align 4, 0x90 -; X86-SSE1-NEXT: .LBB8_1: # %atomicrmw.start -; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE1-NEXT: lock cmpxchg8b (%edi,%esi,8) -; X86-SSE1-NEXT: jne .LBB8_1 -; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE1-NEXT: leal -12(%ebp), %esp -; X86-SSE1-NEXT: popl %esi -; X86-SSE1-NEXT: popl %edi -; 
X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/atomic-load-store-wide.ll b/llvm/test/CodeGen/X86/atomic-load-store-wide.ll --- a/llvm/test/CodeGen/X86/atomic-load-store-wide.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store-wide.ll @@ -16,27 +16,24 @@ ; ; NOSSE-LABEL: test1: ; NOSSE: # %bb.0: -; NOSSE-NEXT: pushl %ebx -; NOSSE-NEXT: .cfi_def_cfa_offset 8 -; NOSSE-NEXT: pushl %esi -; NOSSE-NEXT: .cfi_def_cfa_offset 12 -; NOSSE-NEXT: .cfi_offset %esi, -12 -; NOSSE-NEXT: .cfi_offset %ebx, -8 -; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; NOSSE-NEXT: movl (%esi), %eax -; NOSSE-NEXT: movl 4(%esi), %edx -; NOSSE-NEXT: .p2align 4, 0x90 -; NOSSE-NEXT: .LBB0_1: # %atomicrmw.start -; NOSSE-NEXT: # =>This Inner Loop Header: Depth=1 -; NOSSE-NEXT: lock cmpxchg8b (%esi) -; NOSSE-NEXT: jne .LBB0_1 -; NOSSE-NEXT: # %bb.2: # %atomicrmw.end -; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: pushl %ebp ; NOSSE-NEXT: .cfi_def_cfa_offset 8 -; NOSSE-NEXT: popl %ebx -; NOSSE-NEXT: .cfi_def_cfa_offset 4 +; NOSSE-NEXT: .cfi_offset %ebp, -8 +; NOSSE-NEXT: movl %esp, %ebp +; NOSSE-NEXT: .cfi_def_cfa_register %ebp +; NOSSE-NEXT: andl $-8, %esp +; NOSSE-NEXT: subl $8, %esp +; NOSSE-NEXT: movl 8(%ebp), %eax +; NOSSE-NEXT: movl 12(%ebp), %ecx +; NOSSE-NEXT: movl 16(%ebp), %edx +; NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; NOSSE-NEXT: movl %ecx, (%esp) +; NOSSE-NEXT: fildll (%esp) +; NOSSE-NEXT: fistpll (%eax) +; NOSSE-NEXT: lock orl $0, (%esp) +; NOSSE-NEXT: movl %ebp, %esp +; NOSSE-NEXT: popl %ebp +; NOSSE-NEXT: .cfi_def_cfa %esp, 4 ; NOSSE-NEXT: retl store atomic i64 %val1, i64* %ptr seq_cst, align 8 ret void diff --git a/llvm/test/CodeGen/X86/atomic-mi.ll b/llvm/test/CodeGen/X86/atomic-mi.ll --- a/llvm/test/CodeGen/X86/atomic-mi.ll +++ b/llvm/test/CodeGen/X86/atomic-mi.ll @@ -84,27 +84,21 @@ ; ; X32-LABEL: store_atomic_imm_64: ; X32: # %bb.0: -; X32-NEXT: pushl %ebx -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: pushl %esi -; X32-NEXT: .cfi_def_cfa_offset 12 -; X32-NEXT: .cfi_offset %esi, -12 -; X32-NEXT: .cfi_offset %ebx, -8 -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: xorl %ecx, %ecx -; X32-NEXT: movl $42, %ebx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB3_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB3_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: popl %esi +; X32-NEXT: pushl %ebp ; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: popl %ebx -; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $8, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: movl $0, {{[0-9]+}}(%esp) +; X32-NEXT: movl $42, (%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl ; These are implemented with a CAS loop on 32 bit architectures, and thus ; cannot be optimized in the same way as the others. 
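; A sketch of the idea behind the new sequences checked in this file (the
; register names %lo, %hi, %eax below are illustrative placeholders, not
; generated output): on 32-bit x86, an aligned 8-byte x87 load/store is a
; single atomic memory access on i586 and later, so a 64-bit atomic store can
; be staged through an aligned stack slot instead of a lock cmpxchg8b retry
; loop:
;   movl %lo, (%esp)     # write both halves into an 8-byte-aligned slot
;   movl %hi, 4(%esp)
;   fildll (%esp)        # one 64-bit x87 load from the slot
;   fistpll (%eax)       # one atomic 64-bit store to the destination
; For seq_cst stores, a trailing "lock orl $0, (%esp)" serves as a cheap full
; barrier in place of mfence.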
@@ -123,27 +117,21 @@ ; ; X32-LABEL: store_atomic_imm_64_big: ; X32: # %bb.0: -; X32-NEXT: pushl %ebx -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: pushl %esi -; X32-NEXT: .cfi_def_cfa_offset 12 -; X32-NEXT: .cfi_offset %esi, -12 -; X32-NEXT: .cfi_offset %ebx, -8 -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: movl $23, %ecx -; X32-NEXT: movl $1215752192, %ebx # imm = 0x4876E800 -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB4_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB4_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: popl %esi +; X32-NEXT: pushl %ebp ; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: popl %ebx -; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $8, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: movl $23, {{[0-9]+}}(%esp) +; X32-NEXT: movl $1215752192, (%esp) # imm = 0x4876E800 +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl store atomic i64 100000000000, i64* %p monotonic, align 8 ret void @@ -336,30 +324,20 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: addl $2, %ebx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB14_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB14_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: addl $2, %ecx +; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -383,30 +361,20 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: addl 12(%ebp), %ebx -; X32-NEXT: adcl 16(%ebp), %ecx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB15_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB15_1 -; X32-NEXT: # %bb.2: # 
%atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: addl 12(%ebp), %ecx +; X32-NEXT: adcl 16(%ebp), %edx +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -576,30 +544,20 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: subl 12(%ebp), %ebx -; X32-NEXT: sbbl 16(%ebp), %ecx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB23_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB23_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: subl 12(%ebp), %ecx +; X32-NEXT: sbbl 16(%ebp), %edx +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -751,29 +709,18 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx -; X32-NEXT: andl $2, %ebx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: xorl %ecx, %ecx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB31_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB31_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: andl $2, %ecx +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: movl $0, {{[0-9]+}}(%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -797,30 +744,20 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll 
{{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: andl 16(%ebp), %ecx -; X32-NEXT: andl 12(%ebp), %ebx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB32_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB32_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: andl 16(%ebp), %edx +; X32-NEXT: andl 12(%ebp), %ecx +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -993,29 +930,19 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: orl $2, %ebx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB41_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB41_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: orl $2, %ecx +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -1039,30 +966,20 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: orl 16(%ebp), %ecx -; X32-NEXT: orl 12(%ebp), %ebx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB42_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB42_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: orl 16(%ebp), %edx +; X32-NEXT: orl 12(%ebp), %ecx +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -1235,29 +1152,19 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; 
X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl $2, %ebx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB51_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB51_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl $2, %ecx +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -1281,30 +1188,20 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: xorl 16(%ebp), %ecx -; X32-NEXT: xorl 12(%ebp), %ebx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB52_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB52_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl 16(%ebp), %edx +; X32-NEXT: xorl 12(%ebp), %ecx +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -1438,30 +1335,20 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: addl $1, %ebx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB58_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB58_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: addl $1, %ecx +; 
X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -1586,30 +1473,20 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: addl $-1, %ebx -; X32-NEXT: adcl $-1, %ecx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB63_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB63_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: addl $-1, %ecx +; X32-NEXT: adcl $-1, %edx +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -1719,30 +1596,20 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) -; X32-NEXT: movl (%esp), %ebx +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: notl %edx ; X32-NEXT: notl %ecx -; X32-NEXT: notl %ebx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB68_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB68_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl @@ -1844,30 +1711,20 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $8, %esp -; X32-NEXT: .cfi_offset %esi, -16 -; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %esi -; X32-NEXT: fildll (%esi) -; X32-NEXT: fistpll (%esp) +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: fildll (%eax) +; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: xorl %ecx, %ecx -; X32-NEXT: xorl %ebx, %ebx -; X32-NEXT: subl (%esp), %ebx +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: subl {{[0-9]+}}(%esp), %edx ; X32-NEXT: sbbl {{[0-9]+}}(%esp), 
%ecx -; X32-NEXT: movl (%esi), %eax -; X32-NEXT: movl 4(%esi), %edx -; X32-NEXT: .p2align 4, 0x90 -; X32-NEXT: .LBB73_1: # %atomicrmw.start -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: lock cmpxchg8b (%esi) -; X32-NEXT: jne .LBB73_1 -; X32-NEXT: # %bb.2: # %atomicrmw.end -; X32-NEXT: leal -8(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %ebx +; X32-NEXT: movl %edx, (%esp) +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X32-NEXT: fildll (%esp) +; X32-NEXT: fistpll (%eax) +; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -55,27 +55,10 @@ define void @store_double(double* %fptr, double %v) { ; X86-SSE1-LABEL: store_double: ; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl %ebx -; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE1-NEXT: pushl %esi -; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 -; X86-SSE1-NEXT: .cfi_offset %esi, -12 -; X86-SSE1-NEXT: .cfi_offset %ebx, -8 -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl (%esi), %eax -; X86-SSE1-NEXT: movl 4(%esi), %edx -; X86-SSE1-NEXT: .p2align 4, 0x90 -; X86-SSE1-NEXT: .LBB2_1: # %atomicrmw.start -; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE1-NEXT: lock cmpxchg8b (%esi) -; X86-SSE1-NEXT: jne .LBB2_1 -; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE1-NEXT: popl %esi -; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE1-NEXT: popl %ebx -; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movlps %xmm0, (%eax) ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: store_double: @@ -94,26 +77,16 @@ ; ; X86-NOSSE-LABEL: store_double: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebx -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 -; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 12 -; X86-NOSSE-NEXT: .cfi_offset %esi, -12 -; X86-NOSSE-NEXT: .cfi_offset %ebx, -8 -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOSSE-NEXT: subl $12, %esp +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 16 +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl (%esi), %eax -; X86-NOSSE-NEXT: movl 4(%esi), %edx -; X86-NOSSE-NEXT: .p2align 4, 0x90 -; X86-NOSSE-NEXT: .LBB2_1: # %atomicrmw.start -; X86-NOSSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOSSE-NEXT: lock cmpxchg8b (%esi) -; X86-NOSSE-NEXT: jne .LBB2_1 -; X86-NOSSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-NOSSE-NEXT: popl %esi -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 -; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %ecx, (%esp) +; X86-NOSSE-NEXT: fildll (%esp) +; X86-NOSSE-NEXT: fistpll (%eax) +; X86-NOSSE-NEXT: addl $12, %esp ; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4 ; X86-NOSSE-NEXT: retl ; @@ -299,17 +272,16 @@ define double @load_double(double* %fptr) { ; X86-SSE1-LABEL: load_double: ; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: subl $20, %esp -; X86-SSE1-NEXT: .cfi_def_cfa_offset 24 -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: fildll (%eax) -; 
X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: subl $12, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 16 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %eax, (%esp) +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) -; X86-SSE1-NEXT: addl $20, %esp +; X86-SSE1-NEXT: addl $12, %esp ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE1-NEXT: retl ; @@ -568,27 +540,11 @@ define void @store_double_seq_cst(double* %fptr, double %v) { ; X86-SSE1-LABEL: store_double_seq_cst: ; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl %ebx -; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE1-NEXT: pushl %esi -; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 -; X86-SSE1-NEXT: .cfi_offset %esi, -12 -; X86-SSE1-NEXT: .cfi_offset %ebx, -8 -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl (%esi), %eax -; X86-SSE1-NEXT: movl 4(%esi), %edx -; X86-SSE1-NEXT: .p2align 4, 0x90 -; X86-SSE1-NEXT: .LBB9_1: # %atomicrmw.start -; X86-SSE1-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SSE1-NEXT: lock cmpxchg8b (%esi) -; X86-SSE1-NEXT: jne .LBB9_1 -; X86-SSE1-NEXT: # %bb.2: # %atomicrmw.end -; X86-SSE1-NEXT: popl %esi -; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE1-NEXT: popl %ebx -; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movlps %xmm0, (%eax) +; X86-SSE1-NEXT: lock orl $0, (%esp) ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: store_double_seq_cst: @@ -609,26 +565,17 @@ ; ; X86-NOSSE-LABEL: store_double_seq_cst: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebx -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 -; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 12 -; X86-NOSSE-NEXT: .cfi_offset %esi, -12 -; X86-NOSSE-NEXT: .cfi_offset %ebx, -8 -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOSSE-NEXT: subl $12, %esp +; X86-NOSSE-NEXT: .cfi_def_cfa_offset 16 +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl (%esi), %eax -; X86-NOSSE-NEXT: movl 4(%esi), %edx -; X86-NOSSE-NEXT: .p2align 4, 0x90 -; X86-NOSSE-NEXT: .LBB9_1: # %atomicrmw.start -; X86-NOSSE-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOSSE-NEXT: lock cmpxchg8b (%esi) -; X86-NOSSE-NEXT: jne .LBB9_1 -; X86-NOSSE-NEXT: # %bb.2: # %atomicrmw.end -; X86-NOSSE-NEXT: popl %esi -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 -; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %ecx, (%esp) +; X86-NOSSE-NEXT: fildll (%esp) +; X86-NOSSE-NEXT: fistpll (%eax) +; X86-NOSSE-NEXT: lock orl $0, (%esp) +; X86-NOSSE-NEXT: addl $12, %esp ; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4 ; X86-NOSSE-NEXT: retl ; @@ -712,17 +659,16 @@ define double @load_double_seq_cst(double* %fptr) { ; X86-SSE1-LABEL: load_double_seq_cst: ; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: subl $20, %esp -; X86-SSE1-NEXT: .cfi_def_cfa_offset 24 -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: fildll (%eax) -; 
X86-SSE1-NEXT: fistpll {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: subl $12, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 16 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %eax, (%esp) +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) -; X86-SSE1-NEXT: addl $20, %esp +; X86-SSE1-NEXT: addl $12, %esp ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE1-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -159,9 +159,8 @@ ; X32-SSE1-LABEL: merge_4f32_f32_34uu: ; X32-SSE1: # %bb.0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE1-NEXT: xorps %xmm0, %xmm0 +; X32-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_34uu: @@ -242,11 +241,8 @@ ; X32-SSE1-LABEL: merge_4f32_f32_45zz: ; X32-SSE1: # %bb.0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: xorps %xmm1, %xmm1 -; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE1-NEXT: xorps %xmm0, %xmm0 +; X32-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_45zz: @@ -286,10 +282,9 @@ ; X32-SSE1-LABEL: merge_4f32_f32_012u: ; X32-SSE1: # %bb.0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE1-NEXT: xorps %xmm0, %xmm0 +; X32-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-SSE1-NEXT: retl ; @@ -335,10 +330,9 @@ ; X32-SSE1-LABEL: merge_4f32_f32_019u: ; X32-SSE1: # %bb.0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE1-NEXT: xorps %xmm0, %xmm0 +; X32-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-SSE1-NEXT: retl ; @@ -1197,11 +1191,8 @@ ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE1-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_2345_volatile:
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -67,8 +67,8 @@
; BDVER2-LABEL: f32_one_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; BDVER2-NEXT: vfmsubss {{.*}}(%rip), %xmm1, %xmm0, %xmm0
-; BDVER2-NEXT: vfnmaddss %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
+; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_one_step:
@@ -153,8 +153,8 @@
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; BDVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3
-; BDVER2-NEXT: vfmsubss %xmm0, %xmm3, %xmm1, %xmm0
-; BDVER2-NEXT: vfnmaddss %xmm3, %xmm0, %xmm2, %xmm0
+; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm1 * xmm3) - xmm0
+; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_variables:
@@ -255,10 +255,10 @@
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; BDVER2-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm3
-; BDVER2-NEXT: vfnmaddss %xmm1, %xmm3, %xmm1, %xmm1
-; BDVER2-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER2-NEXT: vfnmaddss %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vfmsubss {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2
+; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1
+; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
+; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_two_step:
@@ -419,8 +419,8 @@
; BDVER2-LABEL: v4f32_one_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
-; BDVER2-NEXT: vfmsubps {{.*}}(%rip), %xmm1, %xmm0, %xmm0
-; BDVER2-NEXT: vfnmaddps %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
+; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step:
@@ -514,8 +514,8 @@
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm1, %xmm2
; BDVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3
-; BDVER2-NEXT: vfmsubps %xmm0, %xmm3, %xmm1, %xmm0
-; BDVER2-NEXT: vfnmaddps %xmm3, %xmm0, %xmm2, %xmm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm1 * xmm3) - xmm0
+; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step_variables:
@@ -616,10 +616,10 @@
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm3
-; BDVER2-NEXT: vfnmaddps %xmm1, %xmm3, %xmm1, %xmm1
-; BDVER2-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER2-NEXT: vfnmaddps %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2
+; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1
+; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
+; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_two_step:
@@ -790,8 +790,8 @@
; BDVER2-LABEL: v8f32_one_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
-; BDVER2-NEXT: vfmsubps {{.*}}(%rip), %ymm1, %ymm0, %ymm0
-; BDVER2-NEXT: vfnmaddps %ymm1, %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - mem
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v8f32_one_step:
@@ -910,10 +910,10 @@
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm3
-; BDVER2-NEXT: vfnmaddps %ymm1, %ymm3, %ymm1, %ymm1
-; BDVER2-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER2-NEXT: vfnmaddps %ymm1, %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm0 * ymm1) - ymm2
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm1
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v8f32_two_step:
@@ -1122,10 +1122,10 @@
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vrcpps %ymm1, %ymm4
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm2, %ymm0, %ymm0
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm4, %ymm1, %ymm1
-; BDVER2-NEXT: vfnmaddps %ymm2, %ymm0, %ymm2, %ymm0
-; BDVER2-NEXT: vfnmaddps %ymm4, %ymm1, %ymm4, %ymm1
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm4
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_one_step:
@@ -1295,15 +1295,15 @@
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm2, %ymm0, %ymm4
-; BDVER2-NEXT: vfnmaddps %ymm2, %ymm4, %ymm2, %ymm2
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm2, %ymm0, %ymm0
-; BDVER2-NEXT: vfnmaddps %ymm2, %ymm0, %ymm2, %ymm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
; BDVER2-NEXT: vrcpps %ymm1, %ymm2
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm2, %ymm1, %ymm4
-; BDVER2-NEXT: vfnmaddps %ymm2, %ymm4, %ymm2, %ymm2
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm2, %ymm1, %ymm1
-; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm2, %ymm1
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm1 * ymm2) - ymm3
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm2) - ymm3
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm2
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_two_step:
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -67,8 +67,8 @@
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; BDVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3
-; BDVER2-NEXT: vfmsubss %xmm2, %xmm3, %xmm0, %xmm0
-; BDVER2-NEXT: vfnmaddss %xmm3, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm2
+; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_2:
@@ -163,8 +163,8 @@
; BDVER2-LABEL: f32_one_step_2_divs:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; BDVER2-NEXT: vfmsubss {{.*}}(%rip), %xmm1, %xmm0, %xmm0
-; BDVER2-NEXT: vfnmaddss %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
+; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1
; BDVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
; BDVER2-NEXT: retq
@@ -278,12 +278,12 @@
; BDVER2-LABEL: f32_two_step_2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; BDVER2-NEXT: vfmsubss {{.*}}(%rip), %xmm1, %xmm0, %xmm2
+; BDVER2-NEXT: vfmsubss {{.*#+}} xmm2 = (xmm0 * xmm1) - mem
; BDVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; BDVER2-NEXT: vfnmaddss %xmm1, %xmm2, %xmm1, %xmm1
+; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1
; BDVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3
-; BDVER2-NEXT: vfmsubss %xmm4, %xmm3, %xmm0, %xmm0
-; BDVER2-NEXT: vfnmaddss %xmm3, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4
+; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_two_step_2:
@@ -400,8 +400,8 @@
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BDVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3
-; BDVER2-NEXT: vfmsubps %xmm2, %xmm3, %xmm0, %xmm0
-; BDVER2-NEXT: vfnmaddps %xmm3, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm2
+; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step2:
@@ -496,8 +496,8 @@
; BDVER2-LABEL: v4f32_one_step_2_divs:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
-; BDVER2-NEXT: vfmsubps {{.*}}(%rip), %xmm1, %xmm0, %xmm0
-; BDVER2-NEXT: vfnmaddps %xmm1, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
+; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
; BDVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
; BDVER2-NEXT: retq
@@ -622,12 +622,12 @@
; BDVER2-LABEL: v4f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
-; BDVER2-NEXT: vfmsubps {{.*}}(%rip), %xmm1, %xmm0, %xmm2
+; BDVER2-NEXT: vfmsubps {{.*#+}} xmm2 = (xmm0 * xmm1) - mem
; BDVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
-; BDVER2-NEXT: vfnmaddps %xmm1, %xmm2, %xmm1, %xmm1
+; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1
; BDVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3
-; BDVER2-NEXT: vfmsubps %xmm4, %xmm3, %xmm0, %xmm0
-; BDVER2-NEXT: vfnmaddps %xmm3, %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4
+; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_two_step2:
@@ -753,8 +753,8 @@
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BDVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
-; BDVER2-NEXT: vfmsubps %ymm2, %ymm3, %ymm0, %ymm0
-; BDVER2-NEXT: vfnmaddps %ymm3, %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm2
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v8f32_one_step2:
@@ -858,8 +858,8 @@
; BDVER2-LABEL: v8f32_one_step_2_divs:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
-; BDVER2-NEXT: vfmsubps {{.*}}(%rip), %ymm1, %ymm0, %ymm0
-; BDVER2-NEXT: vfnmaddps %ymm1, %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - mem
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm1
; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1
; BDVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
; BDVER2-NEXT: retq
@@ -999,12 +999,12 @@
; BDVER2-LABEL: v8f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
-; BDVER2-NEXT: vfmsubps {{.*}}(%rip), %ymm1, %ymm0, %ymm2
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm2 = (ymm0 * ymm1) - mem
; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
-; BDVER2-NEXT: vfnmaddps %ymm1, %ymm2, %ymm1, %ymm1
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm1
; BDVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3
-; BDVER2-NEXT: vfmsubps %ymm4, %ymm3, %ymm0, %ymm0
-; BDVER2-NEXT: vfnmaddps %ymm3, %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm4
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v8f32_two_step2:
@@ -1194,12 +1194,12 @@
; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BDVER2-NEXT: vrcpps %ymm1, %ymm5
; BDVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm4, %ymm0, %ymm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm3
; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
-; BDVER2-NEXT: vfnmaddps %ymm4, %ymm0, %ymm2, %ymm0
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
; BDVER2-NEXT: vmulps %ymm3, %ymm5, %ymm4
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm4, %ymm1, %ymm1
-; BDVER2-NEXT: vfnmaddps %ymm4, %ymm1, %ymm5, %ymm1
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm5 * ymm1) + ymm4
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_one_step2:
@@ -1361,12 +1361,12 @@
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm2, %ymm0, %ymm0
-; BDVER2-NEXT: vfnmaddps %ymm2, %ymm0, %ymm2, %ymm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
; BDVER2-NEXT: vrcpps %ymm1, %ymm2
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm2, %ymm1, %ymm1
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm2) - ymm3
; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3
-; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm2, %ymm1
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm2
; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2
; BDVER2-NEXT: vmulps %ymm0, %ymm3, %ymm0
; BDVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
@@ -1573,19 +1573,19 @@
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm2, %ymm0, %ymm4
-; BDVER2-NEXT: vfnmaddps %ymm2, %ymm4, %ymm2, %ymm2
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BDVER2-NEXT: vmulps %ymm4, %ymm2, %ymm5
-; BDVER2-NEXT: vfmsubps %ymm4, %ymm5, %ymm0, %ymm0
-; BDVER2-NEXT: vfnmaddps %ymm5, %ymm0, %ymm2, %ymm0
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm5) - ymm4
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm5
; BDVER2-NEXT: vrcpps %ymm1, %ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
-; BDVER2-NEXT: vfmsubps %ymm3, %ymm2, %ymm1, %ymm3
-; BDVER2-NEXT: vfnmaddps %ymm2, %ymm3, %ymm2, %ymm2
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm1 * ymm2) - ymm3
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm3) + ymm2
; BDVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4
-; BDVER2-NEXT: vfmsubps %ymm5, %ymm4, %ymm1, %ymm1
-; BDVER2-NEXT: vfnmaddps %ymm4, %ymm1, %ymm2, %ymm1
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm5
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_two_step2:
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -211,14 +211,10 @@
define i8 @unsigned_sat_variable_i8_using_cmp_notval(i8 %x, i8 %y) {
; ANY-LABEL: unsigned_sat_variable_i8_using_cmp_notval:
; ANY: # %bb.0:
-; ANY-NEXT: # kill: def $esi killed $esi def $rsi
-; ANY-NEXT: # kill: def $edi killed $edi def $rdi
-; ANY-NEXT: leal (%rdi,%rsi), %eax
-; ANY-NEXT: notb %sil
-; ANY-NEXT: cmpb %sil, %dil
-; ANY-NEXT: movzbl %al, %ecx
+; ANY-NEXT: addb %dil, %sil
+; ANY-NEXT: movzbl %sil, %ecx
; ANY-NEXT: movl $255, %eax
-; ANY-NEXT: cmovbel %ecx, %eax
+; ANY-NEXT: cmovael %ecx, %eax
; ANY-NEXT: # kill: def $al killed $al killed $eax
; ANY-NEXT: retq
%noty = xor i8 %y, -1
@@ -263,13 +259,9 @@
define i16 @unsigned_sat_variable_i16_using_cmp_notval(i16 %x, i16 %y) {
; ANY-LABEL: unsigned_sat_variable_i16_using_cmp_notval:
; ANY: # %bb.0:
-; ANY-NEXT: # kill: def $esi killed $esi def $rsi
-; ANY-NEXT: # kill: def $edi killed $edi def $rdi
-; ANY-NEXT: leal (%rdi,%rsi), %ecx
-; ANY-NEXT: notl %esi
-; ANY-NEXT: cmpw %si, %di
+; ANY-NEXT: addw %di, %si
; ANY-NEXT: movl $65535, %eax # imm = 0xFFFF
-; ANY-NEXT: cmovbel %ecx, %eax
+; ANY-NEXT: cmovael %esi, %eax
; ANY-NEXT: # kill: def $ax killed $ax killed $eax
; ANY-NEXT: retq
%noty = xor i16 %y, -1
@@ -312,13 +304,9 @@
define i32 @unsigned_sat_variable_i32_using_cmp_notval(i32 %x, i32 %y) {
; ANY-LABEL: unsigned_sat_variable_i32_using_cmp_notval:
; ANY: # %bb.0:
-; ANY-NEXT: # kill: def $esi killed $esi def $rsi
-; ANY-NEXT: # kill: def $edi killed $edi def $rdi
-; ANY-NEXT: leal (%rdi,%rsi), %ecx
-; ANY-NEXT: notl %esi
-; ANY-NEXT: cmpl %esi, %edi
+; ANY-NEXT: addl %esi, %edi
; ANY-NEXT: movl $-1, %eax
-; ANY-NEXT: cmovbel %ecx, %eax
+; ANY-NEXT: cmovael %edi, %eax
; ANY-NEXT: retq
%noty = xor i32 %y, -1
%a = add i32 %x, %y
@@ -359,11 +347,9 @@
define i64 @unsigned_sat_variable_i64_using_cmp_notval(i64 %x, i64 %y) {
; ANY-LABEL: unsigned_sat_variable_i64_using_cmp_notval:
; ANY: # %bb.0:
-; ANY-NEXT: leaq (%rdi,%rsi), %rcx
-; ANY-NEXT: notq %rsi
-; ANY-NEXT: cmpq %rsi, %rdi
+; ANY-NEXT: addq %rsi, %rdi
; ANY-NEXT: movq $-1, %rax
-; ANY-NEXT: cmovbeq %rcx, %rax
+; ANY-NEXT: cmovaeq %rdi, %rax
; ANY-NEXT: retq
%noty = xor i64 %y, -1
%a = add i64 %x, %y
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -0,0 +1,1411 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+
+declare i4 @llvm.sdiv.fix.sat.i4 (i4, i4, i32)
+declare i15 @llvm.sdiv.fix.sat.i15 (i15, i15, i32)
+declare i16 @llvm.sdiv.fix.sat.i16 (i16, i16, i32)
+declare i18 @llvm.sdiv.fix.sat.i18 (i18, i18, i32)
+declare i64 @llvm.sdiv.fix.sat.i64 (i64, i64, i32)
+declare <4 x i32> @llvm.sdiv.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32)
+
+define i16 @func(i16 %x, i16 %y) nounwind {
+;
+; X64-LABEL: func:
+; X64: # %bb.0:
+; X64-NEXT: movswl %si, %esi
+; X64-NEXT: movswl %di, %ecx
+; X64-NEXT: shll $8, %ecx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: cltd
+; X64-NEXT: idivl %esi
+; X64-NEXT: # kill: def $eax killed $eax def $rax
+; X64-NEXT: leal -1(%rax), %edi
+; X64-NEXT: testl %esi, %esi
+; X64-NEXT: sets %sil
+; X64-NEXT: testl %ecx, %ecx
+; X64-NEXT: sets %cl
+; X64-NEXT: xorb %sil, %cl
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: setne %dl
+; X64-NEXT: testb %cl, %dl
+; X64-NEXT: cmovel %eax, %edi
+; X64-NEXT: cmpl $65535, %edi # imm = 0xFFFF
+; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF
+; X64-NEXT: cmovll %edi, %ecx
+; X64-NEXT: cmpl $-65536, %ecx # imm = 0xFFFF0000
+; X64-NEXT: movl $-65536, %eax # imm = 0xFFFF0000
+; X64-NEXT: cmovgl %ecx, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $8, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: cltd
+; X86-NEXT: idivl %esi
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: sets %bl
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: sets %cl
+; X86-NEXT: xorb %bl, %cl
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: setne %dl
+; X86-NEXT: testb %cl, %dl
+; X86-NEXT: cmovel %eax, %edi
+; X86-NEXT: cmpl $65535, %edi # imm = 0xFFFF
+; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF
+; X86-NEXT: cmovll %edi, %ecx
+; X86-NEXT: cmpl $-65536, %ecx # imm = 0xFFFF0000
+; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000
+; X86-NEXT: cmovgl %ecx, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+ %tmp = call i16 @llvm.sdiv.fix.sat.i16(i16 %x, i16 %y, i32 7)
+ ret i16 %tmp
+}
+
+define i16 @func2(i8 %x, i8 %y) nounwind {
+;
+; X64-LABEL: func2:
+; X64: # %bb.0:
+; X64-NEXT: movsbl %dil, %eax
+; X64-NEXT: movsbl %sil, %ecx
+; X64-NEXT: movswl %cx, %esi
+; X64-NEXT: movswl %ax, %ecx
+; X64-NEXT: shll $14, %ecx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: cltd
+; X64-NEXT: idivl %esi
+; X64-NEXT: # kill: def $eax killed $eax def $rax
+; X64-NEXT: leal -1(%rax), %edi
+; X64-NEXT: testl %esi, %esi
+; X64-NEXT: sets %sil
+; X64-NEXT: testl %ecx, %ecx
+; X64-NEXT: sets %cl
+; X64-NEXT: xorb %sil, %cl
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: setne %dl
+; X64-NEXT: testb %cl, %dl
+; X64-NEXT: cmovel %eax, %edi
+; X64-NEXT: cmpl $16383, %edi # imm = 0x3FFF
+; X64-NEXT: movl $16383, %ecx # imm = 0x3FFF
+; X64-NEXT: cmovll %edi, %ecx
+; X64-NEXT: cmpl $-16384, %ecx # imm = 0xC000
+; X64-NEXT: movl $-16384, %eax # imm = 0xC000
+; X64-NEXT: cmovgl %ecx, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $14, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: cltd
+; X86-NEXT: idivl %esi
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: sets %bl
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: sets %cl
+; X86-NEXT: xorb %bl, %cl
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: setne %dl
+; X86-NEXT: testb %cl, %dl
+; X86-NEXT: cmovel %eax, %edi
+; X86-NEXT: cmpl $16383, %edi # imm = 0x3FFF
+; X86-NEXT: movl $16383, %ecx # imm = 0x3FFF
+; X86-NEXT: cmovll %edi, %ecx
+; X86-NEXT: cmpl $-16384, %ecx # imm = 0xC000
+; X86-NEXT: movl $-16384, %eax # imm = 0xC000
+; X86-NEXT: cmovgl %ecx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+ %x2 = sext i8 %x to i15
+ %y2 = sext i8 %y to i15
+ %tmp = call i15 @llvm.sdiv.fix.sat.i15(i15 %x2, i15 %y2, i32 14)
+ %tmp2 = sext i15 %tmp to i16
+ ret i16 %tmp2
+}
+
+define i16 @func3(i15 %x, i8 %y) nounwind {
+;
+; X64-LABEL: func3:
+; X64: # %bb.0:
+; X64-NEXT: shll $8, %esi
+; X64-NEXT: movswl %si, %ecx
+; X64-NEXT: addl %edi, %edi
+; X64-NEXT: shrl $4, %ecx
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: cwtd
+; X64-NEXT: idivw %cx
+; X64-NEXT: # kill: def $ax killed $ax def $rax
+; X64-NEXT: leal -1(%rax), %esi
+; X64-NEXT: testw %di, %di
+; X64-NEXT: sets %dil
+; X64-NEXT: testw %cx, %cx
+; X64-NEXT: sets %cl
+; X64-NEXT: xorb %dil, %cl
+; X64-NEXT: testw %dx, %dx
+; X64-NEXT: setne %dl
+; X64-NEXT: testb %cl, %dl
+; X64-NEXT: cmovel %eax, %esi
+; X64-NEXT: movswl %si, %eax
+; X64-NEXT: cmpl $16383, %eax # imm = 0x3FFF
+; X64-NEXT: movl $16383, %ecx # imm = 0x3FFF
+; X64-NEXT: cmovll %esi, %ecx
+; X64-NEXT: movswl %cx, %eax
+; X64-NEXT: cmpl $-16384, %eax # imm = 0xC000
+; X64-NEXT: movl $49152, %eax # imm = 0xC000
+; X64-NEXT: cmovgl %ecx, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func3:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: movswl %ax, %esi
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: shrl $4, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: cwtd
+; X86-NEXT: idivw %si
+; X86-NEXT: # kill: def $ax killed $ax def $eax
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testw %cx, %cx
+; X86-NEXT: sets %cl
+; X86-NEXT: testw %si, %si
+; X86-NEXT: sets %ch
+; X86-NEXT: xorb %cl, %ch
+; X86-NEXT: testw %dx, %dx
+; X86-NEXT: setne %cl
+; X86-NEXT: testb %ch, %cl
+; X86-NEXT: cmovel %eax, %edi
+; X86-NEXT: movswl %di, %eax
+; X86-NEXT: cmpl $16383, %eax # imm = 0x3FFF
+; X86-NEXT: movl $16383, %ecx # imm = 0x3FFF
+; X86-NEXT: cmovll %edi, %ecx
+; X86-NEXT: movswl %cx, %eax
+; X86-NEXT: cmpl $-16384, %eax # imm = 0xC000
+; X86-NEXT: movl $49152, %eax # imm = 0xC000
+; X86-NEXT: cmovgl %ecx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+ %y2 = sext i8 %y to i15
+ %y3 = shl i15 %y2, 7
+ %tmp = call i15 @llvm.sdiv.fix.sat.i15(i15 %x, i15 %y3, i32 4)
+ %tmp2 = sext i15 %tmp to i16
+ ret i16 %tmp2
+}
+
+define i4 @func4(i4 %x, i4 %y) nounwind {
+;
+; X64-LABEL: func4:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: shlb $4, %sil
+; X64-NEXT: sarb $4, %sil
+; X64-NEXT: shlb $4, %dil
+; X64-NEXT: sarb $4, %dil
+; X64-NEXT: shlb $2, %dil
+; X64-NEXT: movsbl %dil, %ecx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: idivb %sil
+; X64-NEXT: movsbl %ah, %ebx
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: leal -1(%rax), %edi
+; X64-NEXT: movzbl %dil, %edi
+; X64-NEXT: testb %sil, %sil
+; X64-NEXT: sets %dl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: sets %cl
+; X64-NEXT: xorb %dl, %cl
+; X64-NEXT: testb %bl, %bl
+; X64-NEXT: setne %dl
+; X64-NEXT: testb %cl, %dl
+; X64-NEXT: cmovel %eax, %edi
+; X64-NEXT: cmpb $7, %dil
+; X64-NEXT: movl $7, %ecx
+; X64-NEXT: cmovll %edi, %ecx
+; X64-NEXT: cmpb $-8, %cl
+; X64-NEXT: movl $248, %eax
+; X64-NEXT: cmovgl %ecx, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: popq %rbx
+; X64-NEXT: retq
+;
+; X86-LABEL: func4:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: shlb $4, %dl
+; X86-NEXT: sarb $4, %dl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: shlb $4, %dh
+; X86-NEXT: sarb $4, %dh
+; X86-NEXT: shlb $2, %dh
+; X86-NEXT: movsbl %dh, %eax
+; X86-NEXT: idivb %dl
+; X86-NEXT: movsbl %ah, %ecx
+; X86-NEXT: movzbl %al, %esi
+; X86-NEXT: decb %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: testb %dl, %dl
+; X86-NEXT: sets %dl
+; X86-NEXT: testb %dh, %dh
+; X86-NEXT: sets %dh
+; X86-NEXT: xorb %dl, %dh
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: setne %cl
+; X86-NEXT: testb %dh, %cl
+; X86-NEXT: cmovel %esi, %eax
+; X86-NEXT: cmpb $7, %al
+; X86-NEXT: movl $7, %ecx
+; X86-NEXT: cmovll %eax, %ecx
+; X86-NEXT: cmpb $-8, %cl
+; X86-NEXT: movl $248, %eax
+; X86-NEXT: cmovgl %ecx, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+ %tmp = call i4 @llvm.sdiv.fix.sat.i4(i4 %x, i4 %y, i32 2)
+ ret i4 %tmp
+}
+
+define i64 @func5(i64 %x, i64 %y) nounwind {
+;
+; X64-LABEL: func5:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: movq %rdi, %r15
+; X64-NEXT: leaq (%rdi,%rdi), %rax
+; X64-NEXT: shrq $33, %rax
+; X64-NEXT: movq %rdi, %r12
+; X64-NEXT: sarq $63, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: orq %rax, %r12
+; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; X64-NEXT: shlq $32, %r15
+; X64-NEXT: movq %rsi, %rdx
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rsi, %r13
+; X64-NEXT: sarq $63, %r13
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: callq __divti3
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: subq $1, %rbx
+; X64-NEXT: sbbq $0, %rbp
+; X64-NEXT: testq %r13, %r13
+; X64-NEXT: sets %r14b
+; X64-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %r14b # 1-byte Folded Reload
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: callq __modti3
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: setne %al
+; X64-NEXT: testb %r14b, %al
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT: cmpq $-1, %rbx
+; X64-NEXT: movq $-1, %rax
+; X64-NEXT: movq $-1, %rcx
+; X64-NEXT: cmovbq %rbx, %rcx
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: testq %rbp, %rbp
+; X64-NEXT: cmovnsq %rax, %rbx
+; X64-NEXT: cmoveq %rcx, %rbx
+; X64-NEXT: cmovnsq %rdx, %rbp
+; X64-NEXT: testq %rbx, %rbx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: cmovaq %rbx, %rcx
+; X64-NEXT: testq %rbp, %rbp
+; X64-NEXT: cmovnsq %rbp, %rax
+; X64-NEXT: cmovsq %rdx, %rbx
+; X64-NEXT: cmpq $-1, %rbp
+; X64-NEXT: cmoveq %rcx, %rbx
+; X64-NEXT: shrdq $1, %rax, %rbx
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: addq $40, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+;
+; X86-LABEL: func5:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $88, %esp
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl 20(%ebp), %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $31, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $31, %ecx, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll $31, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl 20(%ebp)
+; X86-NEXT: pushl 16(%ebp)
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __divti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $1, %esi
+; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: sets %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: sets %ah
+; X86-NEXT: xorb %al, %ah
+; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl 20(%ebp)
+; X86-NEXT: pushl 16(%ebp)
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __modti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: setne %al
+; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovsl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovsl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT: cmovsl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: cmovel %ebx, %edx
+; X86-NEXT: cmpl $-1, %esi
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovbl %esi, %eax
+; X86-NEXT: cmpl $2147483647, %edi # imm = 0x7FFFFFFF
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovael %ecx, %esi
+; X86-NEXT: cmovel %eax, %esi
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: cmovael %eax, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: cmoval %esi, %eax
+; X86-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmoval %esi, %ecx
+; X86-NEXT: cmovel %eax, %ecx
+; X86-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT: cmoval %edi, %eax
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl $-2147483648, %ebx # imm = 0x80000000
+; X86-NEXT: cmovsl %ebx, %edi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: cmovsl %ebx, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmpl $-1, %edx
+; X86-NEXT: cmovel %ecx, %esi
+; X86-NEXT: cmovel %eax, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.sdiv.fix.sat.i64(i64 %x, i64 %y, i32 31)
+ ret i64 %tmp
+}
+
+define i18 @func6(i16 %x, i16 %y) nounwind {
+;
+; X64-LABEL: func6:
+; X64: # %bb.0:
+; X64-NEXT: movswl %di, %ecx
+; X64-NEXT: movswl %si, %esi
+; X64-NEXT: shll $7, %ecx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: cltd
+; X64-NEXT: idivl %esi
+; X64-NEXT: # kill: def $eax killed $eax def $rax
+; X64-NEXT: leal -1(%rax), %edi
+; X64-NEXT: testl %esi, %esi
+; X64-NEXT: sets %sil
+; X64-NEXT: testl %ecx, %ecx
+; X64-NEXT: sets %cl
+; X64-NEXT: xorb %sil, %cl
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: setne %dl
+; X64-NEXT: testb %cl, %dl
+; X64-NEXT: cmovel %eax, %edi
+; X64-NEXT: cmpl $131071, %edi # imm = 0x1FFFF
+; X64-NEXT: movl $131071, %ecx # imm = 0x1FFFF
+; X64-NEXT: cmovll %edi, %ecx
+; X64-NEXT: cmpl $-131072, %ecx # imm = 0xFFFE0000
+; X64-NEXT: movl $-131072, %eax # imm = 0xFFFE0000
+; X64-NEXT: cmovgl %ecx, %eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func6:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $7, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: cltd
+; X86-NEXT: idivl %esi
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: sets %bl
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: sets %cl
+; X86-NEXT: xorb %bl, %cl
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: setne %dl
+; X86-NEXT: testb %cl, %dl
+; X86-NEXT: cmovel %eax, %edi
+; X86-NEXT: cmpl $131071, %edi # imm = 0x1FFFF
+; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF
+; X86-NEXT: cmovll %edi, %ecx
+; X86-NEXT: cmpl $-131072, %ecx # imm = 0xFFFE0000
+; X86-NEXT: movl $-131072, %eax # imm = 0xFFFE0000
+; X86-NEXT: cmovgl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+ %x2 = sext i16 %x to i18
+ %y2 = sext i16 %y to i18
+ %tmp = call i18 @llvm.sdiv.fix.sat.i18(i18 %x2, i18 %y2, i32 7)
+ ret i18 %tmp
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
+;
+; X64-LABEL: vec:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $104, %rsp
+; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: paddq %xmm0, %xmm0
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movq %xmm0, %rbp
+; X64-NEXT: movq %rbp, %r12
+; X64-NEXT: shrq $33, %r12
+; X64-NEXT: movq %rbp, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r14
+; X64-NEXT: orq %r14, %r12
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: pcmpgtd %xmm1, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movq %xmm1, %rdx
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: shlq $31, %rbp
+; X64-NEXT: movq %rbp, %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: callq __divti3
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r15
+; X64-NEXT: shrq $63, %r14
+; X64-NEXT: xorl %ebx, %r14d
+; X64-NEXT: movq %rbp, %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: callq __modti3
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: setne %al
+; X64-NEXT: testb %r14b, %al
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF
+; X64-NEXT: cmpq %rdx, %r13
+; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-NEXT: cmovbq %r13, %rax
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovnsq %rdx, %r13
+; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: cmovnsq %rcx, %r15
+; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmovaq %r13, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovsq %rcx, %r13
+; X64-NEXT: cmpq $-1, %r15
+; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: movq %xmm0, %r13
+; X64-NEXT: movq %r13, %rbx
+; X64-NEXT: shrq $33, %rbx
+; X64-NEXT: movq %r13, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r14
+; X64-NEXT: orq %r14, %rbx
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: movq %xmm0, %rdx
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: sarq $63, %rbp
+; X64-NEXT: shlq $31, %r13
+; X64-NEXT: movq %r13, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __divti3
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: subq $1, %r12
+; X64-NEXT: sbbq $0, %r15
+; X64-NEXT: shrq $63, %r14
+; X64-NEXT: xorl %ebp, %r14d
+; X64-NEXT: movq %r13, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __modti3
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: setne %al
+; X64-NEXT: testb %r14b, %al
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-NEXT: cmovbq %r12, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovnsq %rcx, %r12
+; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: cmovnsq %rax, %r15
+; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmovaq %r12, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovsq %rcx, %r12
+; X64-NEXT: cmpq $-1, %r15
+; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: movq %r12, %xmm0
+; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: psrlq $1, %xmm1
+; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; X64-NEXT: # xmm1 = mem[2,3,0,1]
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: pcmpgtd %xmm1, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: paddq %xmm1, %xmm1
+; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movq %xmm1, %r12
+; X64-NEXT: movq %r12, %rbx
+; X64-NEXT: shrq $33, %rbx
+; X64-NEXT: movq %r12, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r14
+; X64-NEXT: orq %r14, %rbx
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; X64-NEXT: # xmm1 = mem[2,3,0,1]
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: pcmpgtd %xmm1, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movq %xmm1, %rdx
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: sarq $63, %rbp
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __divti3
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r15
+; X64-NEXT: shrq $63, %r14
+; X64-NEXT: xorl %ebp, %r14d
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __modti3
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: setne %al
+; X64-NEXT: testb %r14b, %al
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-NEXT: cmovbq %r13, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovnsq %rcx, %r13
+; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: cmovnsq %rax, %r15
+; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmovaq %r13, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovsq %rcx, %r13
+; X64-NEXT: cmpq $-1, %r15
+; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: movq %xmm0, %r13
+; X64-NEXT: movq %r13, %rbx
+; X64-NEXT: shrq $33, %rbx
+; X64-NEXT: movq %r13, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r14
+; X64-NEXT: orq %r14, %rbx
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: movq %xmm0, %rdx
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: sarq $63, %rbp
+; X64-NEXT: shlq $31, %r13
+; X64-NEXT: movq %r13, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __divti3
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: subq $1, %r12
+; X64-NEXT: sbbq $0, %r15
+; X64-NEXT: shrq $63, %r14
+; X64-NEXT: xorl %ebp, %r14d
+; X64-NEXT: movq %r13, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __modti3
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: setne %al
+; X64-NEXT: testb %r14b, %al
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-NEXT: cmovbq %r12, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovnsq %rcx, %r12
+; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: cmovnsq %rax, %r15
+; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmovaq %r12, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovsq %rcx, %r12
+; X64-NEXT: cmpq $-1, %r15
+; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: movq %r12, %xmm0
+; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: psrlq $1, %xmm1
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-NEXT: addq $104, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+;
+; X86-LABEL: vec:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $256, %esp # imm = 0x100
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: adcl %eax, %eax
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: shrl %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $31, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: negl %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __modti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: adcl %eax, %eax
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: shrl %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $31, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edi
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __modti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: adcl %eax, %eax
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: shrl %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $31, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: negl %esi
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __divti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: adcl %eax, %eax
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: shrl %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $31, %ebx
+; X86-NEXT: negl %esi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __modti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl 32(%ebp)
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __divti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl 40(%ebp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __divti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl 36(%ebp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __divti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: sbbl $0, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: sets %bl
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: sets %bh
+; X86-NEXT: xorb %bl, %bh
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: setne %al
+; X86-NEXT: testb %bh, %al
+; X86-NEXT: cmovel %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: sbbl $0, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: sets %bl
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: sets %bh
+; X86-NEXT: xorb %bl, %bh
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: orl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: setne %al
+; X86-NEXT: testb %bh, %al
+; X86-NEXT: cmovel %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: sets %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: sets %bl
+; X86-NEXT: xorb %al, %bl
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl 28(%ebp)
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __modti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: setne %al
+; X86-NEXT: testb %bl, %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $1, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %ecx
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: sets %bl
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: sets %bh
+; X86-NEXT: xorb %bl, %bh
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: orl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: setne %al
+; X86-NEXT: testb %bh, %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: cmovsl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovsl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: cmovel %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmovsl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: cmovel %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmovsl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: cmovel %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: cmovsl %eax, %edi
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: cmovsl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: cmpl $-1, %edx
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovael %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovbl %edx, %ecx
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovel %ecx, %esi
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmoval %eax, %ecx
+; X86-NEXT: cmpl $-1, %esi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: movl $-1, %edx
+; X86-NEXT: cmovsl %edx, %esi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmovsl %edx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmpl $-1, %edi
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovnel %esi, %edi
+; X86-NEXT: shldl $31, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmpl $-1, %eax
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovael %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: cmpl $1, %esi
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovbl %esi, %ecx
+; X86-NEXT: andl %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovel %ecx, %ebx
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmoval %eax, %ecx
+; X86-NEXT: cmpl $-1, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: cmovnel %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl $-1, %edx
+; X86-NEXT: cmovsl %edx, %ebx
+; X86-NEXT: cmovsl %edi, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: cmpl $-1, %esi
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovnel %ebx, %esi
+; X86-NEXT: shldl $31, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmpl $-1, %eax
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movl $-1, %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovbl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovel %ecx, %edi
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmoval %eax, %ecx
+; X86-NEXT: cmpl $-1, %edi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: cmovsl %ebx, %edi
+; X86-NEXT: cmovsl %edx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: cmpl $-1, %esi
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovnel %edi, %esi
+; X86-NEXT: shldl $31, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmpl $-1, %eax
+; X86-NEXT: cmovael %ebx, %eax
+; X86-NEXT: movl $-1, %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovbl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: andl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovel %ecx, %ebx
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmoval %eax, %ecx
+; X86-NEXT: cmpl $-1, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: cmovnel %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: cmovsl %esi, %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: cmovsl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: andl %edx, %ebx
+; X86-NEXT: cmpl $-1, %ebx
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovnel %esi, %ebx
+; X86-NEXT: shldl $31, %eax, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+ %tmp = call <4 x i32> @llvm.sdiv.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 31)
+ ret <4 x i32> %tmp
+}
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
+; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,X86-SSE1
+; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,X86-SSE2
; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
-; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
+; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,X64-SSE1
+; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,X64-SSE2
; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
@@ -72,10 +74,17 @@
}
define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; SSE-LABEL: test_mm_andnot_ps:
-; SSE: # %bb.0:
-; SSE-NEXT: andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1]
-; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; X86-SSE1-LABEL: test_mm_andnot_ps:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1]
+; X86-SSE1-NEXT: retl # encoding: [0xc3]
+;
+; X86-SSE2-LABEL: test_mm_andnot_ps:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm2 # encoding: [0x66,0x0f,0x76,0xd2]
+; X86-SSE2-NEXT: pxor %xmm2, %xmm0 # encoding: [0x66,0x0f,0xef,0xc2]
+; X86-SSE2-NEXT: pand %xmm1, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc1]
+; X86-SSE2-NEXT: retl # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_andnot_ps:
; AVX1: # %bb.0:
@@ -89,6 +98,18 @@
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f]
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1]
; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-SSE1-LABEL: test_mm_andnot_ps:
+; X64-SSE1: # %bb.0:
+; X64-SSE1-NEXT: andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1]
+; X64-SSE1-NEXT: retq # encoding: [0xc3]
+;
+; X64-SSE2-LABEL: test_mm_andnot_ps:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm2 # encoding: [0x66,0x0f,0x76,0xd2]
+; X64-SSE2-NEXT: pxor %xmm2, %xmm0 # encoding: [0x66,0x0f,0xef,0xc2]
+; X64-SSE2-NEXT: pand %xmm1, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc1]
+; X64-SSE2-NEXT: retq # encoding: [0xc3]
%arg0 = bitcast <4 x float> %a0 to <4 x i32>
%arg1 = bitcast <4 x float> %a1 to <4 x i32>
%not = xor <4 x i32> %arg0,
@@ -2727,21 +2748,27 @@
}
define void @test_mm_storeh_pi(x86_mmx *%a0, <4 x float> %a1) nounwind {
-; X86-SSE-LABEL: test_mm_storeh_pi:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp # encoding: [0x55]
-; X86-SSE-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5]
-; X86-SSE-NEXT: andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
-; X86-SSE-NEXT: subl $32, %esp # encoding: [0x83,0xec,0x20]
-; X86-SSE-NEXT: movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
-; X86-SSE-NEXT: movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
-; X86-SSE-NEXT: movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
-; X86-SSE-NEXT: movl %ecx, (%eax) # encoding: [0x89,0x08]
-; X86-SSE-NEXT: movl %ebp, %esp # encoding: [0x89,0xec]
-; X86-SSE-NEXT: popl %ebp # encoding: [0x5d]
-; X86-SSE-NEXT: retl # encoding: [0xc3]
+; X86-SSE1-LABEL: test_mm_storeh_pi:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp # encoding: [0x55]
+; X86-SSE1-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5]
+; X86-SSE1-NEXT: andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
+; X86-SSE1-NEXT: subl $32, %esp # encoding: [0x83,0xec,0x20]
+; X86-SSE1-NEXT: movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
+; X86-SSE1-NEXT: movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
+; X86-SSE1-NEXT: movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
+; X86-SSE1-NEXT: movl %ecx, (%eax) # encoding: [0x89,0x08]
+; X86-SSE1-NEXT: movl %ebp, %esp # encoding: [0x89,0xec]
+; X86-SSE1-NEXT: popl %ebp # encoding: [0x5d]
+; X86-SSE1-NEXT: retl # encoding: [0xc3]
+;
+; X86-SSE2-LABEL: test_mm_storeh_pi:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-SSE2-NEXT: movhps %xmm0, (%eax) # encoding: [0x0f,0x17,0x00]
+; X86-SSE2-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storeh_pi:
; X86-AVX1: # %bb.0:
@@ -2755,12 +2782,20 @@
; X86-AVX512-NEXT: vmovhps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x17,0x00]
; X86-AVX512-NEXT: retl # encoding: [0xc3]
;
-; X64-SSE-LABEL: test_mm_storeh_pi:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xf0]
-; X64-SSE-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
-; X64-SSE-NEXT: retq # encoding: [0xc3]
+; X64-SSE1-LABEL: test_mm_storeh_pi:
+; X64-SSE1: # %bb.0:
+; X64-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
+; X64-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xf0]
+; X64-SSE1-NEXT: movq %rax, (%rdi) # encoding:
[0x48,0x89,0x07] +; X64-SSE1-NEXT: retq # encoding: [0xc3] +; +; X64-SSE2-LABEL: test_mm_storeh_pi: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pshufd $78, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x4e] +; X64-SSE2-NEXT: # xmm0 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movq %xmm0, %rax # encoding: [0x66,0x48,0x0f,0x7e,0xc0] +; X64-SSE2-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07] +; X64-SSE2-NEXT: retq # encoding: [0xc3] ; ; X64-AVX1-LABEL: test_mm_storeh_pi: ; X64-AVX1: # %bb.0: @@ -2820,21 +2855,27 @@ } define void @test_mm_storel_pi(x86_mmx *%a0, <4 x float> %a1) nounwind { -; X86-SSE-LABEL: test_mm_storel_pi: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp # encoding: [0x55] -; X86-SSE-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5] -; X86-SSE-NEXT: andl $-16, %esp # encoding: [0x83,0xe4,0xf0] -; X86-SSE-NEXT: subl $32, %esp # encoding: [0x83,0xec,0x20] -; X86-SSE-NEXT: movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08] -; X86-SSE-NEXT: movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24] -; X86-SSE-NEXT: movl (%esp), %ecx # encoding: [0x8b,0x0c,0x24] -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] -; X86-SSE-NEXT: movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04] -; X86-SSE-NEXT: movl %ecx, (%eax) # encoding: [0x89,0x08] -; X86-SSE-NEXT: movl %ebp, %esp # encoding: [0x89,0xec] -; X86-SSE-NEXT: popl %ebp # encoding: [0x5d] -; X86-SSE-NEXT: retl # encoding: [0xc3] +; X86-SSE1-LABEL: test_mm_storel_pi: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebp # encoding: [0x55] +; X86-SSE1-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5] +; X86-SSE1-NEXT: andl $-16, %esp # encoding: [0x83,0xe4,0xf0] +; X86-SSE1-NEXT: subl $32, %esp # encoding: [0x83,0xec,0x20] +; X86-SSE1-NEXT: movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08] +; X86-SSE1-NEXT: movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24] +; X86-SSE1-NEXT: movl (%esp), %ecx # encoding: [0x8b,0x0c,0x24] +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] +; X86-SSE1-NEXT: movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04] +; X86-SSE1-NEXT: movl %ecx, (%eax) # encoding: [0x89,0x08] +; X86-SSE1-NEXT: movl %ebp, %esp # encoding: [0x89,0xec] +; X86-SSE1-NEXT: popl %ebp # encoding: [0x5d] +; X86-SSE1-NEXT: retl # encoding: [0xc3] +; +; X86-SSE2-LABEL: test_mm_storel_pi: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE2-NEXT: movlps %xmm0, (%eax) # encoding: [0x0f,0x13,0x00] +; X86-SSE2-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_storel_pi: ; X86-AVX1: # %bb.0: @@ -2848,12 +2889,18 @@ ; X86-AVX512-NEXT: vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; -; X64-SSE-LABEL: test_mm_storel_pi: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8] -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8] -; X64-SSE-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07] -; X64-SSE-NEXT: retq # encoding: [0xc3] +; X64-SSE1-LABEL: test_mm_storel_pi: +; X64-SSE1: # %bb.0: +; X64-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8] +; X64-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8] +; X64-SSE1-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07] +; X64-SSE1-NEXT: retq # encoding: [0xc3] +; +; X64-SSE2-LABEL: test_mm_storel_pi: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movq %xmm0, %rax # encoding: 
[0x66,0x48,0x0f,0x7e,0xc0] +; X64-SSE2-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07] +; X64-SSE2-NEXT: retq # encoding: [0xc3] ; ; X64-AVX1-LABEL: test_mm_storel_pi: ; X64-AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -0,0 +1,528 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i4 @llvm.udiv.fix.sat.i4 (i4, i4, i32) +declare i15 @llvm.udiv.fix.sat.i15 (i15, i15, i32) +declare i16 @llvm.udiv.fix.sat.i16 (i16, i16, i32) +declare i18 @llvm.udiv.fix.sat.i18 (i18, i18, i32) +declare i64 @llvm.udiv.fix.sat.i64 (i64, i64, i32) +declare <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32) + +define i16 @func(i16 %x, i16 %y) nounwind { +; X64-LABEL: func: +; X64: # %bb.0: +; X64-NEXT: movzwl %si, %ecx +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: shll $8, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %ecx +; X64-NEXT: cmpl $131071, %eax # imm = 0x1FFFF +; X64-NEXT: movl $131071, %ecx # imm = 0x1FFFF +; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: shll $8, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: divl %ecx +; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF +; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %tmp = call i16 @llvm.udiv.fix.sat.i16(i16 %x, i16 %y, i32 7) + ret i16 %tmp +} + +define i16 @func2(i8 %x, i8 %y) nounwind { +; X64-LABEL: func2: +; X64: # %bb.0: +; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: andl $32767, %eax # imm = 0x7FFF +; X64-NEXT: movsbl %sil, %ecx +; X64-NEXT: andl $32767, %ecx # imm = 0x7FFF +; X64-NEXT: shll $14, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %ecx +; X64-NEXT: cmpl $32767, %eax # imm = 0x7FFF +; X64-NEXT: movl $32767, %ecx # imm = 0x7FFF +; X64-NEXT: cmovbl %eax, %ecx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: movswl %cx, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func2: +; X86: # %bb.0: +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $32767, %eax # imm = 0x7FFF +; X86-NEXT: shll $14, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: divl %ecx +; X86-NEXT: cmpl $32767, %eax # imm = 0x7FFF +; X86-NEXT: movl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: movswl %cx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %x2 = sext i8 %x to i15 + %y2 = sext i8 %y to i15 + %tmp = call i15 @llvm.udiv.fix.sat.i15(i15 %x2, i15 %y2, i32 14) + %tmp2 = sext i15 %tmp to i16 + ret i16 %tmp2 +} + +define i16 @func3(i15 %x, i8 %y) nounwind { +; X64-LABEL: func3: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal (%rdi,%rdi), %eax +; X64-NEXT: movzbl %sil, %ecx +; X64-NEXT: shll $4, %ecx +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: 
xorl %edx, %edx +; X64-NEXT: divw %cx +; X64-NEXT: # kill: def $ax killed $ax def $eax +; X64-NEXT: movzwl %ax, %ecx +; X64-NEXT: cmpl $32767, %ecx # imm = 0x7FFF +; X64-NEXT: movl $32767, %ecx # imm = 0x7FFF +; X64-NEXT: cmovbl %eax, %ecx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: movswl %cx, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func3: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl %eax, %eax +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: shll $4, %ecx +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: divw %cx +; X86-NEXT: # kill: def $ax killed $ax def $eax +; X86-NEXT: movzwl %ax, %ecx +; X86-NEXT: cmpl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: movl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: movswl %cx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %y2 = sext i8 %y to i15 + %y3 = shl i15 %y2, 7 + %tmp = call i15 @llvm.udiv.fix.sat.i15(i15 %x, i15 %y3, i32 4) + %tmp2 = sext i15 %tmp to i16 + ret i16 %tmp2 +} + +define i4 @func4(i4 %x, i4 %y) nounwind { +; X64-LABEL: func4: +; X64: # %bb.0: +; X64-NEXT: andb $15, %sil +; X64-NEXT: andb $15, %dil +; X64-NEXT: shlb $2, %dil +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: divb %sil +; X64-NEXT: movzbl %al, %ecx +; X64-NEXT: cmpb $15, %cl +; X64-NEXT: movl $15, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func4: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: andb $15, %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: andb $15, %al +; X86-NEXT: shlb $2, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: divb %cl +; X86-NEXT: movzbl %al, %ecx +; X86-NEXT: cmpb $15, %al +; X86-NEXT: movl $15, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %tmp = call i4 @llvm.udiv.fix.sat.i4(i4 %x, i4 %y, i32 2) + ret i4 %tmp +} + +define i64 @func5(i64 %x, i64 %y) nounwind { +; X64-LABEL: func5: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: leaq (%rdi,%rdi), %rsi +; X64-NEXT: shrq $33, %rsi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: andl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: orq %rax, %rsi +; X64-NEXT: shlq $32, %rdi +; X64-NEXT: xorl %ebx, %ebx +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: callq __udivti3 +; X64-NEXT: cmpq $-1, %rax +; X64-NEXT: movq $-1, %rcx +; X64-NEXT: cmovbq %rax, %rcx +; X64-NEXT: cmpq $1, %rdx +; X64-NEXT: movl $1, %esi +; X64-NEXT: cmovbq %rdx, %rsi +; X64-NEXT: sbbq %rbx, %rbx +; X64-NEXT: notq %rbx +; X64-NEXT: orq %rax, %rbx +; X64-NEXT: cmpq $1, %rdx +; X64-NEXT: cmoveq %rcx, %rbx +; X64-NEXT: shrdq $1, %rsi, %rbx +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: popq %rbx +; X64-NEXT: retq +; +; X86-LABEL: func5: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $24, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl %edx +; X86-NEXT: shldl $31, %eax, %ecx +; X86-NEXT: shll $31, %eax +; X86-NEXT: movl %esp, %esi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $0 +; X86-NEXT: pushl 20(%ebp) +; X86-NEXT: pushl 16(%ebp) +; X86-NEXT: pushl $0 +; X86-NEXT: pushl %edx +; 
X86-NEXT: pushl %ecx +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %esi +; X86-NEXT: calll __udivti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $-1, %esi +; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: cmpl $-1, %edx +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmovael %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: leal -4(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %tmp = call i64 @llvm.udiv.fix.sat.i64(i64 %x, i64 %y, i32 31) + ret i64 %tmp +} + +define i18 @func6(i16 %x, i16 %y) nounwind { +; X64-LABEL: func6: +; X64: # %bb.0: +; X64-NEXT: movswl %di, %eax +; X64-NEXT: andl $262143, %eax # imm = 0x3FFFF +; X64-NEXT: movswl %si, %ecx +; X64-NEXT: andl $262143, %ecx # imm = 0x3FFFF +; X64-NEXT: shll $7, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %ecx +; X64-NEXT: cmpl $262143, %eax # imm = 0x3FFFF +; X64-NEXT: movl $262143, %ecx # imm = 0x3FFFF +; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: retq +; +; X86-LABEL: func6: +; X86: # %bb.0: +; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $262143, %ecx # imm = 0x3FFFF +; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $262143, %eax # imm = 0x3FFFF +; X86-NEXT: shll $7, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: divl %ecx +; X86-NEXT: cmpl $262143, %eax # imm = 0x3FFFF +; X86-NEXT: movl $262143, %ecx # imm = 0x3FFFF +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: retl + %x2 = sext i16 %x to i18 + %y2 = sext i16 %y to i18 + %tmp = call i18 @llvm.udiv.fix.sat.i18(i18 %x2, i18 %y2, i32 7) + ret i18 %tmp +} + +define i16 @func7(i16 %x, i16 %y) nounwind { +; X64-LABEL: func7: +; X64: # %bb.0: +; X64-NEXT: movzwl %si, %ecx +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: addl %eax, %eax +; X64-NEXT: shlq $16, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divq %rcx +; X64-NEXT: cmpq $131071, %rax # imm = 0x1FFFF +; X64-NEXT: movl $131071, %ecx # imm = 0x1FFFF +; X64-NEXT: cmovaeq %rcx, %rax +; X64-NEXT: shrl %eax +; X64-NEXT: # kill: def $ax killed $ax killed $rax +; X64-NEXT: retq +; +; X86-LABEL: func7: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl %cx, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $16, %edx +; X86-NEXT: shll $16, %ecx +; X86-NEXT: pushl $0 +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %ecx +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF +; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %tmp = call i16 @llvm.udiv.fix.sat.i16(i16 %x, i16 %y, i32 16) + ret i16 %tmp +} + +define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { +; X64-LABEL: vec: +; X64: # %bb.0: +; X64-NEXT: pxor %xmm8, %xmm8 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; X64-NEXT: movq %xmm2, %rcx +; X64-NEXT: movdqa %xmm0, %xmm4 +; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; X64-NEXT: paddq %xmm4, %xmm4 +; X64-NEXT: psllq $31, %xmm4 +; X64-NEXT: movq %xmm4, %rax +; X64-NEXT: xorl 
%edx, %edx +; X64-NEXT: divq %rcx +; X64-NEXT: movq %rax, %xmm7 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X64-NEXT: movq %xmm2, %rcx +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] +; X64-NEXT: movq %xmm2, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divq %rcx +; X64-NEXT: movq %rax, %xmm2 +; X64-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; X64-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; X64-NEXT: movdqa %xmm7, %xmm2 +; X64-NEXT: pxor %xmm4, %xmm2 +; X64-NEXT: movdqa {{.*#+}} xmm9 = [9223372043297226751,9223372043297226751] +; X64-NEXT: movdqa %xmm9, %xmm6 +; X64-NEXT: pcmpgtd %xmm2, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; X64-NEXT: pcmpeqd %xmm9, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; X64-NEXT: pand %xmm3, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X64-NEXT: por %xmm5, %xmm2 +; X64-NEXT: movdqa {{.*#+}} xmm6 = [8589934591,8589934591] +; X64-NEXT: pand %xmm2, %xmm7 +; X64-NEXT: pandn %xmm6, %xmm2 +; X64-NEXT: por %xmm7, %xmm2 +; X64-NEXT: psrlq $1, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; X64-NEXT: movq %xmm1, %rcx +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; X64-NEXT: paddq %xmm0, %xmm0 +; X64-NEXT: psllq $31, %xmm0 +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divq %rcx +; X64-NEXT: movq %rax, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: movq %xmm1, %rcx +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divq %rcx +; X64-NEXT: movq %rax, %xmm0 +; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; X64-NEXT: pxor %xmm3, %xmm4 +; X64-NEXT: movdqa %xmm9, %xmm0 +; X64-NEXT: pcmpgtd %xmm4, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-NEXT: pcmpeqd %xmm9, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X64-NEXT: pand %xmm1, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: por %xmm4, %xmm0 +; X64-NEXT: pand %xmm0, %xmm3 +; X64-NEXT: pandn %xmm6, %xmm0 +; X64-NEXT: por %xmm3, %xmm0 +; X64-NEXT: psrlq $1, %xmm0 +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; X64-NEXT: retq +; +; X86-LABEL: vec: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: setb %al +; X86-NEXT: shldl $31, %ecx, %eax +; X86-NEXT: shll $31, %ecx +; X86-NEXT: pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %ecx +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: notl %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: addl %esi, %esi +; X86-NEXT: setb %al +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovel %ecx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $1, %ecx +; X86-NEXT: cmovael %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $31, %esi, %eax +; X86-NEXT: shll $31, %esi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; 
X86-NEXT: pushl %eax +; X86-NEXT: pushl %esi +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $1, %esi +; X86-NEXT: cmovbl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: notl %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: addl %edi, %edi +; X86-NEXT: setb %al +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmovel %ecx, %esi +; X86-NEXT: shldl $31, %edi, %eax +; X86-NEXT: shll $31, %edi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %edi +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl $-1, %ebx +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: notl %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: addl %ebp, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl $1, %edx +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: cmovel %ebx, %edi +; X86-NEXT: shldl $31, %ebp, %ecx +; X86-NEXT: shll $31, %ebp +; X86-NEXT: pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ebp +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $1, %ebx +; X86-NEXT: cmovbl %edx, %ebx +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebp, %ebp +; X86-NEXT: notl %ebp +; X86-NEXT: orl %eax, %ebp +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: cmovel %ecx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $1, %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $1, %eax, %esi +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: shrdl $1, %eax, %edi +; X86-NEXT: shrdl $1, %ebx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebp, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: addl $16, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %tmp = call <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 31) + ret <4 x i32> %tmp +} diff --git a/llvm/test/CodeGen/X86/vec_fneg.ll b/llvm/test/CodeGen/X86/vec_fneg.ll --- a/llvm/test/CodeGen/X86/vec_fneg.ll +++ b/llvm/test/CodeGen/X86/vec_fneg.ll @@ -76,12 +76,10 @@ define <4 x float> @fsub_neg0_undef_elts_undef(<4 x float> %x) { ; X32-SSE-LABEL: fsub_neg0_undef_elts_undef: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = ; X32-SSE-NEXT: retl ; ; X64-SSE-LABEL: fsub_neg0_undef_elts_undef: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = ; X64-SSE-NEXT: retq %r = fsub <4 x float> , undef ret <4 x float> %r diff --git a/llvm/test/ExecutionEngine/JITLink/X86/Inputs/MachO_GOTAndStubsOptimizationHelper.s b/llvm/test/ExecutionEngine/JITLink/X86/Inputs/MachO_GOTAndStubsOptimizationHelper.s new file mode 100644 --- /dev/null +++ 
b/llvm/test/ExecutionEngine/JITLink/X86/Inputs/MachO_GOTAndStubsOptimizationHelper.s @@ -0,0 +1,8 @@ + .section __TEXT,__text,regular,pure_instructions + .macosx_version_min 10, 14 + .globl bypass_got + .p2align 4, 0x90 +bypass_got: + movq _x@GOTPCREL(%rip), %rax + +.subsections_via_symbols diff --git a/llvm/test/ExecutionEngine/JITLink/X86/MachO_GOTAndStubsOptimization.s b/llvm/test/ExecutionEngine/JITLink/X86/MachO_GOTAndStubsOptimization.s new file mode 100644 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/X86/MachO_GOTAndStubsOptimization.s @@ -0,0 +1,31 @@ +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=x86_64-apple-macos10.9 -filetype=obj \ +# RUN: -o %t/helper.o %S/Inputs/MachO_GOTAndStubsOptimizationHelper.s +# RUN: llvm-mc -triple=x86_64-apple-macos10.9 -filetype=obj \ +# RUN: -o %t/testcase.o %s +# RUN: llvm-jitlink -noexec -slab-allocate 64Kb -entry=bypass_stub -check %s \ +# RUN: %t/testcase.o %t/helper.o +# +# Test that references to in-range GOT and stub targets can be bypassed. +# The helper file contains a function that uses the GOT for _x, and this file +# contains an external call to that function. By slab allocating the JIT memory +# we can ensure that the references and targets will be in-range of one another, +# which should cause both the GOT load and stub to be bypassed. + + .section __TEXT,__text,regular,pure_instructions + .macosx_version_min 10, 14 + .globl bypass_stub + .p2align 4, 0x90 + +# jitlink-check: decode_operand(bypass_got, 4) = _x - next_pc(bypass_got) +# jitlink-check: decode_operand(bypass_stub, 0) = bypass_got - next_pc(bypass_stub) +bypass_stub: + callq bypass_got + + .section __DATA,__data + .globl _x + .p2align 2 +_x: + .long 42 + +.subsections_via_symbols diff --git a/llvm/test/ExecutionEngine/JITLink/X86/MachO_x86-64_relocations.s b/llvm/test/ExecutionEngine/JITLink/X86/MachO_x86-64_relocations.s --- a/llvm/test/ExecutionEngine/JITLink/X86/MachO_x86-64_relocations.s +++ b/llvm/test/ExecutionEngine/JITLink/X86/MachO_x86-64_relocations.s @@ -1,6 +1,8 @@ # RUN: rm -rf %t && mkdir -p %t # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t/macho_reloc.o %s -# RUN: llvm-jitlink -noexec -define-abs external_data=0xdeadbeef -define-abs external_func=0xcafef00d -check=%s %t/macho_reloc.o +# RUN: llvm-jitlink -noexec -define-abs external_data=0xffffffffdeadbeef \ +# RUN: -define-abs external_func=0xffffffffcafef00d \ +# RUN: -define-abs lowaddr_symbol=0x1000 -check=%s %t/macho_reloc.o .section __TEXT,__text,regular,pure_instructions @@ -170,11 +172,11 @@ # Check X86_64_RELOC_UNSIGNED / long / extern handling by putting the address of # an external function (defined to reside in the low 4Gb) into a long symbol. # -# jitlink-check: *{4}named_func_addr_long = external_func - .globl named_func_addr_long +# jitlink-check: *{4}named_lowaddr_symbol_long = lowaddr_symbol + .globl named_lowaddr_symbol_long .p2align 2 -named_func_addr_long: - .long external_func +named_lowaddr_symbol_long: + .long lowaddr_symbol # Check X86_64_RELOC_UNSIGNED / quad / non-extern handling by putting the # address of a local anonymous function into a quad symbol. 
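For context on the GOT-and-stubs optimization exercised above: once the JIT linker can prove a target lies within signed-32-bit (±2 GB) range of the referencing instruction, it can rewrite both indirections into direct references. A minimal sketch of the two rewrites in x86-64 assembly, assuming the usual instruction forms; `_x` is the symbol from the test, while `_helper` and `_helper_stub` are hypothetical labels used only for illustration:

  # Before: the address of _x is loaded through a GOT entry, and the
  # external function is reached through a JIT-emitted stub.
  movq  _x@GOTPCREL(%rip), %rax   # load &_x from the GOT
  callq _helper_stub              # branch via the stub

  # After (targets proven in range): both indirections are bypassed.
  leaq  _x(%rip), %rax            # direct PC-relative address of _x
  callq _helper                   # direct call to the target

The jitlink-check lines in the test verify exactly this shape: the decoded operands must point straight at the final targets rather than at GOT or stub entries.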
diff --git a/llvm/test/ExecutionEngine/OrcLazy/static-initializers-in-objectfiles.ll b/llvm/test/ExecutionEngine/OrcLazy/static-initializers-in-objectfiles.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ExecutionEngine/OrcLazy/static-initializers-in-objectfiles.ll @@ -0,0 +1,28 @@ +; RUN: rm -rf %t +; RUN: mkdir -p %t +; RUN: lli -jit-kind=orc-lazy -enable-cache-manager -object-cache-dir=%t %s +; RUN: lli -jit-kind=orc-lazy -enable-cache-manager -object-cache-dir=%t %s +; +; Verify that LLJIT Platforms respect static initializers in cached objects. +; This IR file contains a static initializer that must execute for main to exit +; with value zero. The first execution will populate an object cache for the +; second. The initializer in the cached objects must also be successfully run +; for the test to pass. + +@HasError = global i8 1, align 1 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @resetHasError, i8* null }] + +define void @resetHasError() { +entry: + store i8 0, i8* @HasError, align 1 + ret void +} + +define i32 @main(i32 %argc, i8** %argv) #2 { +entry: + %0 = load i8, i8* @HasError, align 1 + %tobool = trunc i8 %0 to i1 + %conv = zext i1 %tobool to i32 + ret i32 %conv +} + diff --git a/llvm/test/MC/ELF/exclude-debug-dwo.s b/llvm/test/MC/ELF/exclude-debug-dwo.s --- a/llvm/test/MC/ELF/exclude-debug-dwo.s +++ b/llvm/test/MC/ELF/exclude-debug-dwo.s @@ -10,23 +10,23 @@ # CHECK: .debug_loc.dwo {{.*}} E # CHECK: .debug_str_offsets.dwo {{.*}} E -.section .debug_info.dwo +.section .debug_info.dwo,"e" nop -.section .debug_types.dwo +.section .debug_types.dwo,"e" nop -.section .debug_abbrev.dwo +.section .debug_abbrev.dwo,"e" nop -.section .debug_str.dwo +.section .debug_str.dwo,"MSe",@progbits,1 nop -.section .debug_line.dwo +.section .debug_line.dwo,"e" nop -.section .debug_loc.dwo +.section .debug_loc.dwo,"e" nop -.section .debug_str_offsets.dwo +.section .debug_str_offsets.dwo,"e" nop diff --git a/llvm/test/MC/ELF/section-entsize-changed.s b/llvm/test/MC/ELF/section-entsize-changed.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/ELF/section-entsize-changed.s @@ -0,0 +1,12 @@ +# RUN: not llvm-mc -triple=x86_64 %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error: + +foo: +.section .foo,"aM",@progbits,1 + +# CHECK: {{.*}}.s:[[# @LINE+1]]:1: error: changed section entsize for .foo, expected: 1 +.section .foo,"aM",@progbits,4 + +# CHECK: {{.*}}.s:[[# @LINE+1]]:1: error: changed section entsize for .foo, expected: 1 +.pushsection .foo,"aM",@progbits,4 + +.pushsection .foo,"aM",@progbits,1 diff --git a/llvm/test/MC/ELF/section-flags-changed.s b/llvm/test/MC/ELF/section-flags-changed.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/ELF/section-flags-changed.s @@ -0,0 +1,12 @@ +# RUN: not llvm-mc -triple=x86_64 %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error: + +foo: +.section .foo,"ax",@progbits + +# CHECK: {{.*}}.s:[[# @LINE+1]]:1: error: changed section flags for .foo, expected: 0x6 +.section .foo,"awx",@progbits + +# CHECK: {{.*}}.s:[[# @LINE+1]]:1: error: changed section flags for .foo, expected: 0x6 +.pushsection .foo,"a",@progbits + +.pushsection .foo,"ax",@progbits diff --git a/llvm/test/MC/ELF/section-type-changed.s b/llvm/test/MC/ELF/section-type-changed.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/ELF/section-type-changed.s @@ -0,0 +1,11 @@ +# RUN: not llvm-mc -triple=x86_64 %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error: + +.section 
.foo,"a",@progbits + +# CHECK: {{.*}}.s:[[# @LINE+1]]:1: error: changed section type for .foo, expected: 0x1 +.section .foo,"a",@init_array + +# CHECK: {{.*}}.s:[[# @LINE+1]]:1: error: changed section type for .foo, expected: 0x1 +.pushsection .foo,"a",@nobits + +.pushsection .foo,"a",@progbits diff --git a/llvm/test/Other/cfg_deopt_unreach.ll b/llvm/test/Other/cfg_deopt_unreach.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Other/cfg_deopt_unreach.ll @@ -0,0 +1,33 @@ +; RUN: opt < %s -analyze -dot-cfg -cfg-hide-unreachable-paths -cfg-dot-filename-prefix=unreach 2>/dev/null +; RUN: FileCheck %s -input-file=unreach.callee.dot -check-prefix=UNREACH +; RUN: opt < %s -analyze -dot-cfg -cfg-hide-deoptimize-paths -cfg-dot-filename-prefix=deopt 2>/dev/null +; RUN: FileCheck %s -input-file=deopt.callee.dot -check-prefix=DEOPT +; RUN: opt < %s -analyze -dot-cfg -cfg-dot-filename-prefix=no-flags 2>/dev/null +; RUN: FileCheck %s -input-file=no-flags.callee.dot -check-prefix=NO-FLAGS +; RUN: opt < %s -analyze -dot-cfg -cfg-hide-unreachable-paths -cfg-hide-deoptimize-paths -cfg-dot-filename-prefix=both-flags 2>/dev/null +; RUN: FileCheck %s -input-file=both-flags.callee.dot -check-prefix=BOTH-FLAGS + +declare i8 @llvm.experimental.deoptimize.i8(...) + +define i8 @callee(i1* %c) alwaysinline { +; NO-FLAGS: [shape=record,label="{%0:\l %c0 = load volatile i1, i1* %c\l br i1 %c0, label %lleft, label %lright\l|{T|F}}"]; +; DEOPT: [shape=record,label="{%0:\l %c0 = load volatile i1, i1* %c\l br i1 %c0, label %lleft, label %lright\l|{T|F}}"]; +; UNREACH: [shape=record,label="{%0:\l %c0 = load volatile i1, i1* %c\l br i1 %c0, label %lleft, label %lright\l|{T|F}}"]; +; BOTH-FLAGS-NOT: [shape=record,label="{%0:\l %c0 = load volatile i1, i1* %c\l br i1 %c0, label %lleft, label %lright\l|{T|F}}"]; + %c0 = load volatile i1, i1* %c + br i1 %c0, label %lleft, label %lright +; NO-FLAGS: [shape=record,label="{lleft: \l %v0 = call i8 (...) @llvm.experimental.deoptimize.i8(i32 1) [ \"deopt\"(i32 1)\l... ]\l ret i8 %v0\l}"]; +; DEOPT-NOT: [shape=record,label="{lleft: \l %v0 = call i8 (...) @llvm.experimental.deoptimize.i8(i32 1) [ \"deopt\"(i32 1)\l... ]\l ret i8 %v0\l}"]; +; UNREACH: [shape=record,label="{lleft: \l %v0 = call i8 (...) @llvm.experimental.deoptimize.i8(i32 1) [ \"deopt\"(i32 1)\l... ]\l ret i8 %v0\l}"]; +; BOTH-FLAGS-NOT: [shape=record,label="{lleft: \l %v0 = call i8 (...) @llvm.experimental.deoptimize.i8(i32 1) [ \"deopt\"(i32 1)\l... ]\l ret i8 %v0\l}"]; +lleft: + %v0 = call i8(...) @llvm.experimental.deoptimize.i8(i32 1) [ "deopt"(i32 1) ] + ret i8 %v0 + +; NO-FLAGS: [shape=record,label="{lright: \l unreachable\l}"]; +; DEOPT: [shape=record,label="{lright: \l unreachable\l}"]; +; UNREACH-NOT: [shape=record,label="{lright: \l unreachable\l}"]; +; BOTH-FLAGS-NOT: [shape=record,label="{lright: \l unreachable\l}"]; +lright: + unreachable +} diff --git a/llvm/test/Other/module-pass-printer.ll b/llvm/test/Other/module-pass-printer.ll --- a/llvm/test/Other/module-pass-printer.ll +++ b/llvm/test/Other/module-pass-printer.ll @@ -1,13 +1,43 @@ ; Check pass name is only printed once. 
-; RUN: opt < %s 2>&1 -forceattrs -disable-output -print-after-all | FileCheck %s -; RUN: opt < %s 2>&1 -forceattrs -disable-output -print-after-all -filter-print-funcs=foo,bar | FileCheck %s +; Check only one function is printed. +; RUN: opt < %s 2>&1 -forceattrs -disable-output -print-after-all -filter-print-funcs=foo | FileCheck %s -check-prefix=FOO +; RUN: opt < %s 2>&1 -passes=forceattrs -disable-output -print-after-all -filter-print-funcs=foo | FileCheck %s -check-prefix=FOO + +; Check pass name is only printed once. +; Check both functions are printed. +; RUN: opt < %s 2>&1 -forceattrs -disable-output -print-after-all -filter-print-funcs=foo,bar | FileCheck %s -check-prefix=BOTH +; RUN: opt < %s 2>&1 -passes=forceattrs -disable-output -print-after-all -filter-print-funcs=foo,bar | FileCheck %s -check-prefix=BOTH ; Check pass name is not printed if a module doesn't include any function specified in -filter-print-funcs. ; RUN: opt < %s 2>&1 -forceattrs -disable-output -print-after-all -filter-print-funcs=baz | FileCheck %s -allow-empty -check-prefix=EMPTY +; RUN: opt < %s 2>&1 -passes=forceattrs -disable-output -print-after-all -filter-print-funcs=baz | FileCheck %s -allow-empty -check-prefix=EMPTY + +; Check the whole module is printed with the user-specified wildcard switch -filter-print-funcs=* or -print-module-scope. +; RUN: opt < %s 2>&1 -forceattrs -disable-output -print-after-all | FileCheck %s -check-prefix=ALL +; RUN: opt < %s 2>&1 -forceattrs -disable-output -print-after-all -filter-print-funcs=* | FileCheck %s -check-prefix=ALL +; RUN: opt < %s 2>&1 -forceattrs -disable-output -print-after-all -filter-print-funcs=foo -print-module-scope | FileCheck %s -check-prefix=ALL +; RUN: opt < %s 2>&1 -passes=forceattrs -disable-output -print-after-all | FileCheck %s -check-prefix=ALL +; RUN: opt < %s 2>&1 -passes=forceattrs -disable-output -print-after-all -filter-print-funcs=* | FileCheck %s -check-prefix=ALL +; RUN: opt < %s 2>&1 -passes=forceattrs -disable-output -print-after-all -filter-print-funcs=foo -print-module-scope | FileCheck %s -check-prefix=ALL + +; FOO: IR Dump After {{Force set function attributes|ForceFunctionAttrsPass}} +; FOO: define void @foo +; FOO-NOT: define void @bar +; FOO-NOT: IR Dump After {{Force set function attributes|ForceFunctionAttrsPass}} + +; BOTH: IR Dump After {{Force set function attributes|ForceFunctionAttrsPass}} +; BOTH: define void @foo +; BOTH: define void @bar +; BOTH-NOT: IR Dump After {{Force set function attributes|ForceFunctionAttrsPass}} +; BOTH-NOT: ModuleID = + +; EMPTY-NOT: IR Dump After {{Force set function attributes|ForceFunctionAttrsPass}} -; CHECK: *** IR Dump After Force set function attributes *** -; CHECK-NOT: *** IR Dump After Force set function attributes *** -; EMPTY-NOT: *** IR Dump After Force set function attributes *** +; ALL: IR Dump After {{Force set function attributes|ForceFunctionAttrsPass}} +; ALL: ModuleID = +; ALL: define void @foo +; ALL: define void @bar +; ALL-NOT: IR Dump After {{Force set function attributes|ForceFunctionAttrsPass}} define void @foo() { ret void diff --git a/llvm/test/Transforms/Attributor/liveness.ll b/llvm/test/Transforms/Attributor/liveness.ll --- a/llvm/test/Transforms/Attributor/liveness.ll +++ b/llvm/test/Transforms/Attributor/liveness.ll @@ -3,6 +3,7 @@ ; RUN: opt -attributor-cgscc --attributor-disable=false -attributor-annotate-decl-cs -attributor-max-iterations=7 -S < %s | FileCheck %s --check-prefixes=CHECK,CGSCC,CGSCC_OLD ; RUN: opt -passes=attributor --attributor-disable=false 
-attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=7 -S < %s | FileCheck %s --check-prefixes=CHECK,MODULE,MODULE_NEW ; RUN: opt -passes='attributor-cgscc' --attributor-disable=false -attributor-annotate-decl-cs -attributor-max-iterations=7 -S < %s | FileCheck %s --check-prefixes=CHECK,CGSCC,CGSCC_NEW +; XFAIL: * ; UTC_ARGS: --disable ; MODULE_OLD: @dead_with_blockaddress_users.l = constant [2 x i8*] [i8* inttoptr (i32 1 to i8*), i8* inttoptr (i32 1 to i8*)] diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll @@ -102,9 +102,9 @@ ; pattern as well. define i64 @uaddo6_xor(i64 %a, i64 %b) { ; CHECK-LABEL: @uaddo6_xor( -; CHECK-NEXT: [[X:%.*]] = xor i64 [[A:%.*]], -1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X]], [[B:%.*]] -; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42 +; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 [[B:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1 +; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42 ; CHECK-NEXT: ret i64 [[Q]] ; %x = xor i64 %a, -1 @@ -115,13 +115,13 @@ define i64 @uaddo6_xor_commuted(i64 %a, i64 %b) { ; CHECK-LABEL: @uaddo6_xor_commuted( -; CHECK-NEXT: [[X:%.*]] = xor i64 -1, [[A:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X]], [[B:%.*]] -; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42 +; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 [[B:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1 +; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42 ; CHECK-NEXT: ret i64 [[Q]] ; - %x = xor i64 -1, %a - %cmp = icmp ult i64 %x, %b + %x = xor i64 %a, -1 + %cmp = icmp ugt i64 %b, %x %Q = select i1 %cmp, i64 %b, i64 42 ret i64 %Q } diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll --- a/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll @@ -153,9 +153,9 @@ ; pattern as well. 
define i64 @uaddo6_xor(i64 %a, i64 %b) { ; CHECK-LABEL: @uaddo6_xor( -; CHECK-NEXT: [[X:%.*]] = xor i64 [[A:%.*]], -1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X]], [[B:%.*]] -; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42 +; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 [[B:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1 +; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42 ; CHECK-NEXT: ret i64 [[Q]] ; %x = xor i64 %a, -1 @@ -166,12 +166,12 @@ define i64 @uaddo6_xor_commuted(i64 %a, i64 %b) { ; CHECK-LABEL: @uaddo6_xor_commuted( -; CHECK-NEXT: [[X:%.*]] = xor i64 -1, [[A:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X]], [[B:%.*]] -; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42 +; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 [[B:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1 +; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42 ; CHECK-NEXT: ret i64 [[Q]] ; - %x = xor i64 -1, %a + %x = xor i64 %a, -1 %cmp = icmp ult i64 %x, %b %Q = select i1 %cmp, i64 %b, i64 42 ret i64 %Q @@ -194,6 +194,23 @@ ret i64 %Q } +; Make sure we do not use the XOR binary operator as insert point, as it may +; come before the second operand of the overflow intrinsic. +define i1 @uaddo6_xor_op_after_XOR(i32 %a, i32* %b.ptr) { +; CHECK-LABEL: @uaddo6_xor_op_after_XOR( +; CHECK-NEXT: [[B:%.*]] = load i32, i32* [[B_PTR:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A:%.*]], i32 [[B]]) +; CHECK-NEXT: [[OV1:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1 +; CHECK-NEXT: [[OV:%.*]] = xor i1 [[OV1]], true +; CHECK-NEXT: ret i1 [[OV]] +; + %x = xor i32 %a, -1 + %b = load i32, i32* %b.ptr, align 8 + %cmp14 = icmp ugt i32 %b, %x + %ov = xor i1 %cmp14, true + ret i1 %ov +} + ; When adding 1, the general pattern for add-overflow may be different due to icmp canonicalization. 
; PR31754: https://bugs.llvm.org/show_bug.cgi?id=31754 diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; XFAIL: * ; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s define void @write4to7(i32* nocapture %p) { diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; XFAIL: * ; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-missing-debugloc.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-missing-debugloc.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-missing-debugloc.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-missing-debugloc.ll @@ -1,7 +1,6 @@ ; Test that the getelementptr generated when the dse pass determines that ; a memset can be shortened has the debugloc carried over from the memset. -; XFAIL: * ; RUN: opt -S -march=native -dse -enable-dse-memoryssa < %s| FileCheck %s ; CHECK: bitcast [5 x i64]* %{{[a-zA-Z_][a-zA-Z0-9_]*}} to i8*, !dbg ; CHECK-NEXT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %0, i64 32, !dbg ![[DBG:[0-9]+]] diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll @@ -248,7 +248,6 @@ ; We cannot eliminate the store in for.header, as the location is not overwritten ; in for.body and read afterwards. 
define void @loop_multiple_def_uses_mayalias_write(i32* %p, i32* %q) { - ; CHECK-LABEL: @loop_multiple_def_uses_mayalias_write( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_HEADER:%.*]] diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memintrinsics.ll @@ -43,7 +43,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB3:%.*]] diff --git a/llvm/test/Transforms/InstCombine/fneg.ll b/llvm/test/Transforms/InstCombine/fneg.ll --- a/llvm/test/Transforms/InstCombine/fneg.ll +++ b/llvm/test/Transforms/InstCombine/fneg.ll @@ -89,20 +89,20 @@ define <4 x double> @fmul_fsub_vec(<4 x double> %x) { ; CHECK-LABEL: @fmul_fsub_vec( -; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], ; CHECK-NEXT: ret <4 x double> [[R]] ; - %m = fmul <4 x double> %x, + %m = fmul <4 x double> %x, %r = fsub <4 x double> , %m ret <4 x double> %r } define <4 x double> @fmul_fneg_vec(<4 x double> %x) { ; CHECK-LABEL: @fmul_fneg_vec( -; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], ; CHECK-NEXT: ret <4 x double> [[R]] ; - %m = fmul <4 x double> %x, + %m = fmul <4 x double> %x, %r = fneg <4 x double> %m ret <4 x double> %r } @@ -277,20 +277,20 @@ define <4 x double> @fdiv_op0_constant_fsub_vec(<4 x double> %x) { ; CHECK-LABEL: @fdiv_op0_constant_fsub_vec( -; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] ; CHECK-NEXT: ret <4 x double> [[R]] ; - %d = fdiv <4 x double> , %x + %d = fdiv <4 x double> , %x %r = fsub <4 x double> , %d ret <4 x double> %r } define <4 x double> @fdiv_op0_constant_fneg_vec(<4 x double> %x) { ; CHECK-LABEL: @fdiv_op0_constant_fneg_vec( -; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] ; CHECK-NEXT: ret <4 x double> [[R]] ; - %d = fdiv <4 x double> , %x + %d = fdiv <4 x double> , %x %r = fneg <4 x double> %d ret <4 x double> %r } diff --git a/llvm/test/tools/llvm-objcopy/ELF/only-keep-debug.test b/llvm/test/tools/llvm-objcopy/ELF/only-keep-debug.test --- a/llvm/test/tools/llvm-objcopy/ELF/only-keep-debug.test +++ b/llvm/test/tools/llvm-objcopy/ELF/only-keep-debug.test @@ -193,9 +193,9 @@ # CHECK3: [Nr] Name Type Address Off Size ES Flg Lk Inf Al # CHECK3: [ 1] .dynsym NOBITS 0000000000000000 000040 000018 18 A 2 1 1024 -# CHECK3-NEXT: [ 2] .dynstr NOBITS 0000000000000000 000040 000001 00 A 0 0 0 -# CHECK3-NEXT: [ 3] .symtab NOBITS 0000000000000000 000040 000018 18 A 4 1 0 -# CHECK3-NEXT: [ 4] .strtab NOBITS 0000000000000000 000040 000001 00 A 0 0 0 +# CHECK3-NEXT: [ 2] .dynstr NOBITS 0000000000000018 000040 000001 00 A 0 0 0 +# CHECK3-NEXT: [ 3] .symtab NOBITS 0000000000000019 000040 000018 18 A 4 1 0 +# CHECK3-NEXT: [ 4] .strtab 
NOBITS 0000000000000031 000040 000001 00 A 0 0 0 # CHECK3-NEXT: [ 5] .shstrtab STRTAB 0000000000000000 000040 00002b 00 0 0 1 --- !ELF diff --git a/llvm/test/tools/llvm-objdump/AMDGPU/source-lines.ll b/llvm/test/tools/llvm-objdump/AMDGPU/source-lines.ll --- a/llvm/test/tools/llvm-objdump/AMDGPU/source-lines.ll +++ b/llvm/test/tools/llvm-objdump/AMDGPU/source-lines.ll @@ -5,6 +5,7 @@ ; Prologue. ; LINE: source_lines_test: +; LINE-NEXT: ; source_lines_test(): ; LINE-NEXT: ; {{.*}}source-lines.cl:1 ; Kernel. ; LINE: v_mov_b32_e32 v{{[0-9]+}}, 0x777 diff --git a/llvm/test/tools/llvm-objdump/Hexagon/source-interleave-hexagon.ll b/llvm/test/tools/llvm-objdump/Hexagon/source-interleave-hexagon.ll --- a/llvm/test/tools/llvm-objdump/Hexagon/source-interleave-hexagon.ll +++ b/llvm/test/tools/llvm-objdump/Hexagon/source-interleave-hexagon.ll @@ -66,6 +66,7 @@ !22 = !DILocation(line: 8, column: 13, scope: !14) !23 = !DILocation(line: 8, column: 3, scope: !14) ; LINES: main: +; LINES-NEXT: main(): ; LINES-NEXT: SRC_COMPDIR/source-interleave-hexagon.c:6 ; SOURCE: main: diff --git a/llvm/test/tools/llvm-objdump/X86/source-interleave-function-from-debug.test b/llvm/test/tools/llvm-objdump/X86/source-interleave-function-from-debug.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/X86/source-interleave-function-from-debug.test @@ -0,0 +1,108 @@ +;; Verify that llvm-objdump -l also prints the function name in disassembly +;; output, getting it from the debug info. + +; RUN: llc < %s -o %t.o -filetype=obj -mtriple=x86_64-unknown-linux-gnu +; RUN: llvm-objdump -dl %t.o | FileCheck %s --check-prefixes=CHECK,CHECK-NO-DEMANGLE + +; RUN: llc < %s -o %t.o -filetype=obj -mtriple=x86_64-unknown-linux-gnu +; RUN: llvm-objdump -dlC %t.o | FileCheck %s --check-prefixes=CHECK,CHECK-DEMANGLE + +; CHECK: 0000000000000000 foo: +; CHECK-NEXT: ; foo(): +; CHECK-NEXT: ; /tmp{{/|\\}}src.cc:1 +; CHECK-NEXT: 0: b8 05 00 00 00 movl $5, %eax +; CHECK-NEXT: 5: c3 retq + +; CHECK-NO-DEMANGLE: 0000000000000010 _ZN3xyz3barEv: +; CHECK-NO-DEMANGLE-NEXT: ; _ZN3xyz3barEv(): +; CHECK-DEMANGLE: 0000000000000010 xyz::bar(): +; CHECK-DEMANGLE-NEXT: ; xyz::bar(): + +; CHECK-NEXT: ; /tmp{{/|\\}}src.cc:3 +; CHECK-NEXT: 10: b8 0a 00 00 00 movl $10, %eax +; CHECK-NEXT: 15: c3 retq + +; CHECK-NO-DEMANGLE: 0000000000000020 _ZN3xyz3bazEv: +; CHECK-NO-DEMANGLE-NEXT: ; _ZN3xyz3bazEv(): +; CHECK-DEMANGLE: 0000000000000020 xyz::baz(): +; CHECK-DEMANGLE-NEXT: ; xyz::baz(): + +; CHECK-NEXT: ; /tmp{{/|\\}}src.cc:3 +; CHECK-NEXT: 20: b8 14 00 00 00 movl $20, %eax +; CHECK-NEXT: 25: c3 retq + +;; When symbol information is missing, we can get function names from debug +;; info. The IR is intentionally doctored to have different names in debug info +;; for the test case here. 
+; RUN: llvm-strip %t.o -N foo -N _ZN3xyz3barEv -N _ZN3xyz3bazEv -o %t-stripped.o +; RUN: llvm-objdump -dlC %t-stripped.o | FileCheck %s --check-prefix=STRIPPED + +; STRIPPED: 0000000000000000 .text: +; STRIPPED-NEXT: ; Function1(): +; STRIPPED-NEXT: ; /tmp{{/|\\}}src.cc:1 +; STRIPPED-NEXT: 0: b8 05 00 00 00 movl $5, %eax +; STRIPPED-NEXT: 5: c3 retq + +; STRIPPED: ; xyz::bar(): +; STRIPPED-NEXT: ; /tmp{{/|\\}}src.cc:3 +; STRIPPED-NEXT: 10: b8 0a 00 00 00 movl $10, %eax +; STRIPPED-NEXT: 15: c3 retq + +; STRIPPED: ; xyz::baz(): +; STRIPPED-NEXT: ; /tmp{{/|\\}}src.cc:3 +; STRIPPED-NEXT: 20: b8 14 00 00 00 movl $20, %eax +; STRIPPED-NEXT: 25: c3 retq + +;; IR adapted from: +;; $ cat /tmp/src.cc +;; extern "C" int foo() { return 5; }; +;; namespace xyz { +;; int bar() { return 10; } int baz() { return 20; } +;; } // namespace xyz +;; $ clang++ -O -g -c /tmp/src.cc -S -emit-llvm +;; Note: bar() and baz() intentionally written on the same line. + +; ModuleID = '/tmp/src.cc' +source_filename = "/tmp/src.cc" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local i32 @foo() #0 !dbg !7 { +entry: + ret i32 5, !dbg !12 +} + +define dso_local i32 @_ZN3xyz3barEv() #0 !dbg !13 { +entry: + ret i32 10, !dbg !15 +} + +define dso_local i32 @_ZN3xyz3bazEv() #0 !dbg !16 { +entry: + ret i32 20, !dbg !17 +} + +attributes #0 = { "frame-pointer"="none" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang trunk", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +; Note: triggers a bad DILineInfo. We still print "Function1()". +!1 = !DIFile(filename: "", directory: "") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang trunk)"} +!7 = distinct !DISubprogram(name: "Function1", scope: !8, file: !8, line: 1, type: !9, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DIFile(filename: "/tmp/src.cc", directory: "") +!9 = !DISubroutineType(types: !10) +!10 = !{!11} +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !DILocation(line: 1, column: 24, scope: !7) +!13 = distinct !DISubprogram(name: "bar", linkageName: "_ZN3xyz3barEv", scope: !14, file: !8, line: 3, type: !9, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!14 = !DINamespace(name: "xyz", scope: null) +!15 = !DILocation(line: 3, column: 13, scope: !13) +!16 = distinct !DISubprogram(name: "baz", linkageName: "_ZN3xyz3bazEv", scope: !14, file: !8, line: 3, type: !9, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!17 = !DILocation(line: 3, column: 38, scope: !16) diff --git a/llvm/test/tools/llvm-objdump/X86/source-interleave-x86_64.test b/llvm/test/tools/llvm-objdump/X86/source-interleave-x86_64.test --- a/llvm/test/tools/llvm-objdump/X86/source-interleave-x86_64.test +++ b/llvm/test/tools/llvm-objdump/X86/source-interleave-x86_64.test @@ -10,6 +10,7 @@ # RUN: FileCheck --check-prefix=SOURCE --strict-whitespace %s < %t2 # LINES: main: +# LINES-NEXT: ; main(): # LINES-NEXT: ; {{[ -\(\)_A-Za-z0-9.\\/:]+}}source-interleave-x86_64.c:6 # SOURCE: main: diff --git a/llvm/test/tools/llvm-objdump/embedded-source.test b/llvm/test/tools/llvm-objdump/embedded-source.test --- 
a/llvm/test/tools/llvm-objdump/embedded-source.test +++ b/llvm/test/tools/llvm-objdump/embedded-source.test @@ -13,6 +13,7 @@ ; } ; LINE: main: +; LINE-NEXT: ; main(): ; LINE-NEXT: ; {{.*}}embedded-source.c:1 ; LINE-NEXT: pushq %rbp ; LINE: ; {{.*}}embedded-source.c:2 diff --git a/llvm/test/tools/llvm-readobj/ELF/verneed-invalid.test b/llvm/test/tools/llvm-readobj/ELF/verneed-invalid.test --- a/llvm/test/tools/llvm-readobj/ELF/verneed-invalid.test +++ b/llvm/test/tools/llvm-readobj/ELF/verneed-invalid.test @@ -13,7 +13,7 @@ # GNU-VERNEED-NAME-NEXT: 000: 0 (*local*) 2 () # GNU-VERNEED-NAME: Version needs section '.gnu.version_r' contains 1 entries: -# GNU-VERNEED-NAME-NEXT: Addr: 0000000000000000 Offset: 0x000044 Link: 4 (.dynstr) +# GNU-VERNEED-NAME-NEXT: Addr: 0000000000200214 Offset: 0x000044 Link: 4 (.dynstr) # GNU-VERNEED-NAME-NEXT: 0x0000: Version: 1 File: somefile Cnt: 1 # GNU-VERNEED-NAME-NEXT: 0x0010: Name: Flags: none Version: 2 @@ -89,7 +89,7 @@ # GNU-NOLINK-NEXT: warning: '[[FILE]]': invalid string table linked to SHT_GNU_verneed section with index 2: invalid sh_type for string table section [index 0]: expected SHT_STRTAB, but got SHT_NULL # GNU-NOLINK-NEXT: 000: 0 (*local*) 2 () # GNU-NOLINK: Version needs section '.gnu.version_r' contains 1 entries: -# GNU-NOLINK-NEXT: Addr: 0000000000000000 Offset: 0x000044 Link: 0 () +# GNU-NOLINK-NEXT: Addr: 0000000000000004 Offset: 0x000044 Link: 0 () # GNU-NOLINK-NEXT: 0x0000: Version: 1 File: Cnt: 1 # GNU-NOLINK-NEXT: 0x0010: Name: Flags: none Version: 2 @@ -231,7 +231,7 @@ # LLVM-OFFSET-EQ-NEXT: ] # GNU-OFFSET-EQ: Version needs section '.gnu.version_r' contains 1 entries: -# GNU-OFFSET-EQ-NEXT: Addr: 0000000000000000 Offset: 0x000044 Link: 1 (.mystrtab) +# GNU-OFFSET-EQ-NEXT: Addr: 0000000000000004 Offset: 0x000044 Link: 1 (.mystrtab) # GNU-OFFSET-EQ-NEXT: 0x0000: Version: 1 File: Cnt: 1 # GNU-OFFSET-EQ-NEXT: 0x0010: Name: Flags: none Version: 0 @@ -562,7 +562,7 @@ # GNU-CUSTOM-DYNSTR-NEXT: Addr: 0000000000000000 Offset: 0x000040 Link: 4 (.dynsym) # GNU-CUSTOM-DYNSTR-NEXT: 000: 0 (*local*) 2 (bcdefghij) # GNU-CUSTOM-DYNSTR: Version needs section '.gnu.version_r' contains 1 entries: -# GNU-CUSTOM-DYNSTR-NEXT: Addr: 0000000000000000 Offset: 0x000044 Link: 3 (.custom.dynstr) +# GNU-CUSTOM-DYNSTR-NEXT: Addr: 0000000000000004 Offset: 0x000044 Link: 3 (.custom.dynstr) # GNU-CUSTOM-DYNSTR-NEXT: 0x0000: Version: 1 File: j Cnt: 1 # GNU-CUSTOM-DYNSTR-NEXT: 0x0010: Name: bcdefghij Flags: none Version: 2 diff --git a/llvm/test/tools/llvm-readobj/ELF/versioninfo.test b/llvm/test/tools/llvm-readobj/ELF/versioninfo.test --- a/llvm/test/tools/llvm-readobj/ELF/versioninfo.test +++ b/llvm/test/tools/llvm-readobj/ELF/versioninfo.test @@ -275,7 +275,7 @@ # GNU-NEXT: 004: 5 (v2) 6 (v3) # GNU-EMPTY: # GNU-NEXT: Version definition section '.gnu.version_d' contains 6 entries: -# GNU-NEXT: Addr: 0000000000000000 Offset: 0x00004c Link: 5 (.dynstr) +# GNU-NEXT: Addr: 000000000000000c Offset: 0x00004c Link: 5 (.dynstr) # GNU-NEXT: 0x0000: Rev: 1 Flags: none Index: 0 Cnt: 1 Name: VERSION1 # GNU-NEXT: 0x001c: Rev: 1 Flags: BASE Index: 0 Cnt: 1 Name: VERSION1 # GNU-NEXT: 0x0038: Rev: 1 Flags: WEAK Index: 0 Cnt: 1 Name: VERSION1 @@ -286,7 +286,7 @@ # GNU-NEXT: 0x00b0: Parent 2: VERSION3 # GNU-EMPTY: # GNU-NEXT: Version needs section '.gnu.version_r' contains 2 entries: -# GNU-NEXT: Addr: 0000000000000000 Offset: 0x000104 Link: 5 (.dynstr) +# GNU-NEXT: Addr: 00000000000000c4 Offset: 0x000104 Link: 5 (.dynstr) # GNU-NEXT: 0x0000: Version: 1 File: verneed1.so.0 Cnt: 5 # 
GNU-NEXT: 0x0010: Name: v1 Flags: BASE Version: 0 # GNU-NEXT: 0x0020: Name: v1 Flags: WEAK Version: 0 diff --git a/llvm/test/tools/obj2yaml/elf-gnu-hash-section.yaml b/llvm/test/tools/obj2yaml/elf-gnu-hash-section.yaml --- a/llvm/test/tools/obj2yaml/elf-gnu-hash-section.yaml +++ b/llvm/test/tools/obj2yaml/elf-gnu-hash-section.yaml @@ -45,6 +45,7 @@ # INVALID-NEXT: - Name: .gnu.hash.empty # INVALID-NEXT: Type: SHT_GNU_HASH # INVALID-NEXT: Flags: [ SHF_ALLOC ] +# INVALID-NEXT: Address: 0x000000000000000F # INVALID-NEXT: Header: # INVALID-NEXT: SymNdx: 0x00000000 # INVALID-NEXT: Shift2: 0x00000000 diff --git a/llvm/test/tools/yaml2obj/ELF/section-address-assign.yaml b/llvm/test/tools/yaml2obj/ELF/section-address-assign.yaml new file mode 100644 --- /dev/null +++ b/llvm/test/tools/yaml2obj/ELF/section-address-assign.yaml @@ -0,0 +1,98 @@ +## Test that yaml2obj automatically assigns sh_addr to allocatable sections for ET_EXEC/ET_DYN files. + +# RUN: yaml2obj %s -o %t.so -D TYPE=ET_DYN +# RUN: llvm-readelf --sections %t.so | FileCheck %s --check-prefix=EXE-DSO + +# RUN: yaml2obj %s -o %t -D TYPE=ET_EXEC +# RUN: llvm-readelf --sections %t | FileCheck %s --check-prefix=EXE-DSO + +# RUN: yaml2obj %s -o %t.o -D TYPE=ET_REL +# RUN: llvm-readelf --sections %t.o | FileCheck %s --check-prefix=REL + +## We assign virtual addresses to allocatable sections automatically for executables and shared libraries. + +# EXE-DSO: Section Headers: +# EXE-DSO-NEXT: [Nr] Name Type Address Off Size ES Flg Lk Inf Al +# EXE-DSO-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 0 0 +# EXE-DSO-NEXT: [ 1] .text.any.addr PROGBITS 0000000000001000 000040 000003 00 A 0 0 0 +# EXE-DSO-NEXT: [ 2] .text.shsize PROGBITS 0000000000001003 000043 001234 00 A 0 0 0 +# EXE-DSO-NEXT: [ 3] .text.align PROGBITS 0000000000001100 000100 000004 00 A 0 0 256 +# EXE-DSO-NEXT: [ 4] .data.any.addr PROGBITS 0000000000002000 000104 000001 00 A 0 0 0 +# EXE-DSO-NEXT: [ 5] .data.after.fill PROGBITS 0000000000002101 000205 000001 00 A 0 0 0 +# EXE-DSO-NEXT: [ 6] .data.return.back PROGBITS 0000000000001500 000206 000001 00 A 0 0 0 +# EXE-DSO-NEXT: [ 7] .data.return.back.foo PROGBITS 0000000000001501 000207 000000 00 A 0 0 0 +# EXE-DSO-NEXT: [ 8] .dynsym DYNSYM 0000000000001508 000208 000018 18 A 9 1 8 +# EXE-DSO-NEXT: [ 9] .dynstr STRTAB 0000000000001520 000220 000001 00 A 0 0 1 +# EXE-DSO-NEXT: [10] .strtab STRTAB 0000000000000000 000221 000001 00 0 0 1 +# EXE-DSO-NEXT: [11] .shstrtab STRTAB 0000000000000000 000222 000093 00 0 0 1 + +## We do not assign virtual addresses to allocatable sections in a relocatable object ## unless the YAML document has an explicit request.
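The rule the REL expectations below contrast with can be paraphrased as: keep a running virtual address cursor, let an explicit `Address` reset it (possibly backwards), and round it up for `AddressAlign`. A minimal sketch of that paraphrase, illustrating the behavior under test rather than yaml2obj's actual code:

```c++
#include "llvm/ADT/Optional.h"
#include "llvm/Support/MathExtras.h"

#include <cstdint>

// Returns sh_addr for one allocatable section and advances the cursor.
static uint64_t assignSectionAddress(uint64_t &VA,
                                     llvm::Optional<uint64_t> Address,
                                     uint64_t AddressAlign, uint64_t Size) {
  if (Address)
    VA = *Address; // An explicit Address always wins; it may move us back.
  else if (AddressAlign)
    VA = llvm::alignTo(VA, AddressAlign); // E.g. 0x1003 rounds up to 0x1100.
  uint64_t SecAddr = VA;
  VA += Size; // The content size advances the cursor; ShSize does not.
  return SecAddr;
}
```

This reproduces the EXE-DSO expectations above: `.text.align` lands at 0x1100 and `.data.return.back` rewinds the cursor to 0x1500.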
+ +# REL: Section Headers: +# REL-NEXT: [Nr] Name Type Address Off Size ES Flg Lk Inf Al +# REL-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 0 0 +# REL-NEXT: [ 1] .text.any.addr PROGBITS 0000000000001000 000040 000003 00 A 0 0 0 +# REL-NEXT: [ 2] .text.shsize PROGBITS 0000000000000000 000043 001234 00 A 0 0 0 +# REL-NEXT: [ 3] .text.align PROGBITS 0000000000000000 000100 000004 00 A 0 0 256 +# REL-NEXT: [ 4] .data.any.addr PROGBITS 0000000000002000 000104 000001 00 A 0 0 0 +# REL-NEXT: [ 5] .data.after.fill PROGBITS 0000000000000000 000205 000001 00 A 0 0 0 +# REL-NEXT: [ 6] .data.return.back PROGBITS 0000000000001500 000206 000001 00 A 0 0 0 +# REL-NEXT: [ 7] .data.return.back.foo PROGBITS 0000000000000000 000207 000000 00 A 0 0 0 +# REL-NEXT: [ 8] .dynsym DYNSYM 0000000000000000 000208 000018 18 A 9 1 8 +# REL-NEXT: [ 9] .dynstr STRTAB 0000000000000000 000220 000001 00 A 0 0 1 +# REL-NEXT: [10] .strtab STRTAB 0000000000000000 000221 000001 00 0 0 1 +# REL-NEXT: [11] .shstrtab STRTAB 0000000000000000 000222 000093 00 0 0 1 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: [[TYPE]] + Machine: EM_X86_64 +Sections: +## Show we can place a section at any address. + - Name: .text.any.addr + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x1000 + Size: 0x3 +## Test that ShSize does not affect virtual addresses. + - Name: .text.shsize + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + ShSize: 0x1234 +## Show we respect the address alignment when automatically +## assigning virtual addresses. + - Name: .text.align + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + AddressAlign: 0x100 + Size: 0x4 +## We can set another address for a subsequent section. + - Name: .data.any.addr + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2000 + Size: 0x1 +## Show that Fill occupies VA space. + - Type: Fill + Pattern: "AABB" + Size: 0x100 + - Name: .data.after.fill + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Size: 0x1 +## Show we can move back in the address space and +## continue placing sections. The order of sections in the +## section header table will match the order in the YAML description. + - Name: .data.return.back + Address: 0x1500 + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Size: 0x1 + - Name: .data.return.back.foo + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] +## Used to trigger creation of .dynsym and .dynstr. +DynamicSymbols: [] diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp --- a/llvm/tools/lli/lli.cpp +++ b/llvm/tools/lli/lli.cpp @@ -274,6 +274,7 @@ SmallString<128> dir(sys::path::parent_path(CacheName)); sys::fs::create_directories(Twine(dir)); } + std::error_code EC; raw_fd_ostream outfile(CacheName, EC, sys::fs::OF_None); outfile.write(Obj.getBufferStart(), Obj.getBufferSize()); @@ -306,14 +307,16 @@ size_t PrefixLength = Prefix.length(); if (ModID.substr(0, PrefixLength) != Prefix) return false; - std::string CacheSubdir = ModID.substr(PrefixLength); + + std::string CacheSubdir = ModID.substr(PrefixLength); #if defined(_WIN32) - // Transform "X:\foo" => "/X\foo" for convenience. - if (isalpha(CacheSubdir[0]) && CacheSubdir[1] == ':') { - CacheSubdir[1] = CacheSubdir[0]; - CacheSubdir[0] = '/'; - } + // Transform "X:\foo" => "/X\foo" for convenience.
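+  // For example, "D:\tmp\mod1.bc" becomes "/D\tmp\mod1.bc", so the drive
+  // letter survives as an ordinary path component once CacheDir is prepended
+  // below. (Illustrative path; any drive letter behaves the same way.)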
+ if (isalpha(CacheSubdir[0]) && CacheSubdir[1] == ':') { + CacheSubdir[1] = CacheSubdir[0]; + CacheSubdir[0] = '/'; + } #endif + CacheName = CacheDir + CacheSubdir; size_t pos = CacheName.rfind('.'); CacheName.replace(pos, CacheName.length() - pos, ".o"); @@ -777,30 +780,56 @@ static void exitOnLazyCallThroughFailure() { exit(1); } +Expected +loadModule(StringRef Path, orc::ThreadSafeContext TSCtx) { + SMDiagnostic Err; + auto M = parseIRFile(Path, Err, *TSCtx.getContext()); + if (!M) { + std::string ErrMsg; + { + raw_string_ostream ErrMsgStream(ErrMsg); + Err.print("lli", ErrMsgStream); + } + return make_error(std::move(ErrMsg), inconvertibleErrorCode()); + } + + if (EnableCacheManager) + M->setModuleIdentifier("file:" + M->getModuleIdentifier()); + + return orc::ThreadSafeModule(std::move(M), std::move(TSCtx)); +} + int runOrcLazyJIT(const char *ProgName) { // Start setting up the JIT environment. // Parse the main module. orc::ThreadSafeContext TSCtx(std::make_unique()); - SMDiagnostic Err; - auto MainModule = parseIRFile(InputFile, Err, *TSCtx.getContext()); - if (!MainModule) - reportError(Err, ProgName); + auto MainModule = ExitOnErr(loadModule(InputFile, TSCtx)); + + // Get TargetTriple and DataLayout from the main module if they're explicitly + // set. + Optional TT; + Optional DL; + MainModule.withModuleDo([&](Module &M) { + if (!M.getTargetTriple().empty()) + TT = Triple(M.getTargetTriple()); + if (!M.getDataLayout().isDefault()) + DL = M.getDataLayout(); + }); - Triple TT(MainModule->getTargetTriple()); orc::LLLazyJITBuilder Builder; Builder.setJITTargetMachineBuilder( - MainModule->getTargetTriple().empty() - ? ExitOnErr(orc::JITTargetMachineBuilder::detectHost()) - : orc::JITTargetMachineBuilder(TT)); + TT ? orc::JITTargetMachineBuilder(*TT) + : ExitOnErr(orc::JITTargetMachineBuilder::detectHost())); + + TT = Builder.getJITTargetMachineBuilder()->getTargetTriple(); + if (DL) + Builder.setDataLayout(DL); if (!MArch.empty()) Builder.getJITTargetMachineBuilder()->getTargetTriple().setArchName(MArch); - if (!MainModule->getDataLayout().isDefault()) - Builder.setDataLayout(MainModule->getDataLayout()); - Builder.getJITTargetMachineBuilder() ->setCPU(getCPUStr()) .addFeatures(getFeatureList()) @@ -815,11 +844,34 @@ pointerToJITTargetAddress(exitOnLazyCallThroughFailure)); Builder.setNumCompileThreads(LazyJITCompileThreads); + // If the object cache is enabled then set a custom compile function + // creator to use the cache. + std::unique_ptr CacheManager; + if (EnableCacheManager) { + + CacheManager = std::make_unique(ObjectCacheDir); + + Builder.setCompileFunctionCreator( + [&](orc::JITTargetMachineBuilder JTMB) + -> Expected> { + if (LazyJITCompileThreads > 0) + return std::make_unique(std::move(JTMB), + CacheManager.get()); + + auto TM = JTMB.createTargetMachine(); + if (!TM) + return TM.takeError(); + + return std::make_unique(std::move(*TM), + CacheManager.get()); + }); + } + // Set up LLJIT platform. { LLJITPlatform P = Platform; if (P == LLJITPlatform::DetectHost) { - if (TT.isOSBinFormatMachO()) + if (TT->isOSBinFormatMachO()) P = LLJITPlatform::MachO; else P = LLJITPlatform::GenericIR; @@ -871,8 +923,7 @@ }))); // Add the main module. - ExitOnErr( - J->addLazyIRModule(orc::ThreadSafeModule(std::move(MainModule), TSCtx))); + ExitOnErr(J->addLazyIRModule(std::move(MainModule))); // Create JITDylibs and add any extra modules. 
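The `CacheManager` wired into the compile-function creator above implements the `llvm::ObjectCache` interface. A minimal in-memory sketch of that interface, for orientation only; lli's real cache manager persists objects to files under `ObjectCacheDir` rather than into a map:

```c++
#include "llvm/ADT/StringMap.h"
#include "llvm/ExecutionEngine/ObjectCache.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/MemoryBuffer.h"

#include <memory>

// Toy object cache keyed by module identifier, held entirely in memory.
class InMemoryObjectCache : public llvm::ObjectCache {
  llvm::StringMap<std::unique_ptr<llvm::MemoryBuffer>> Cached;

public:
  // Called by the JIT after it compiles a module.
  void notifyObjectCompiled(const llvm::Module *M,
                            llvm::MemoryBufferRef Obj) override {
    Cached[M->getModuleIdentifier()] =
        llvm::MemoryBuffer::getMemBufferCopy(Obj.getBuffer());
  }

  // Called by the JIT before compiling; returning null forces compilation.
  std::unique_ptr<llvm::MemoryBuffer>
  getObject(const llvm::Module *M) override {
    auto It = Cached.find(M->getModuleIdentifier());
    if (It == Cached.end())
      return nullptr;
    return llvm::MemoryBuffer::getMemBufferCopy(It->second->getBuffer());
  }
};
```

The JITDylib setup continues below.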
{ @@ -894,16 +945,13 @@ for (auto EMItr = ExtraModules.begin(), EMEnd = ExtraModules.end(); EMItr != EMEnd; ++EMItr) { - auto M = parseIRFile(*EMItr, Err, *TSCtx.getContext()); - if (!M) - reportError(Err, ProgName); + auto M = ExitOnErr(loadModule(*EMItr, TSCtx)); auto EMIdx = ExtraModules.getPosition(EMItr - ExtraModules.begin()); assert(EMIdx != 0 && "ExtraModule should have index > 0"); auto JDItr = std::prev(IdxToDylib.lower_bound(EMIdx)); auto &JD = *JDItr->second; - ExitOnErr( - J->addLazyIRModule(JD, orc::ThreadSafeModule(std::move(M), TSCtx))); + ExitOnErr(J->addLazyIRModule(JD, std::move(M))); } for (auto EAItr = ExtraArchives.begin(), EAEnd = ExtraArchives.end(); diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -553,13 +553,20 @@ private: bool cacheSource(const DILineInfo& LineInfoFile); + void printLines(raw_ostream &OS, const DILineInfo &LineInfo, + StringRef Delimiter); + + void printSources(raw_ostream &OS, const DILineInfo &LineInfo, + StringRef ObjectFilename, StringRef Delimiter); + public: SourcePrinter() = default; SourcePrinter(const ObjectFile *Obj, StringRef DefaultArch) : Obj(Obj), WarnedNoDebugInfo(false) { symbolize::LLVMSymbolizer::Options SymbolizerOpts; - SymbolizerOpts.PrintFunctions = DILineInfoSpecifier::FunctionNameKind::None; - SymbolizerOpts.Demangle = false; + SymbolizerOpts.PrintFunctions = + DILineInfoSpecifier::FunctionNameKind::LinkageName; + SymbolizerOpts.Demangle = Demangle; SymbolizerOpts.DefaultArch = std::string(DefaultArch); Symbolizer.reset(new symbolize::LLVMSymbolizer(SymbolizerOpts)); } @@ -624,34 +631,57 @@ reportWarning(Warning, ObjectFilename); WarnedNoDebugInfo = true; } - return; } - if (LineInfo.Line == 0 || ((OldLineInfo.Line == LineInfo.Line) && - (OldLineInfo.FileName == LineInfo.FileName))) - return; - if (PrintLines) + printLines(OS, LineInfo, Delimiter); + if (PrintSource) + printSources(OS, LineInfo, ObjectFilename, Delimiter); + OldLineInfo = LineInfo; +} + +void SourcePrinter::printLines(raw_ostream &OS, const DILineInfo &LineInfo, + StringRef Delimiter) { + bool PrintFunctionName = LineInfo.FunctionName != DILineInfo::BadString && + LineInfo.FunctionName != OldLineInfo.FunctionName; + if (PrintFunctionName) { + OS << Delimiter << LineInfo.FunctionName; + // If demangling is successful, FunctionName will end with "()". Print it + // only if demangling did not run or was unsuccessful. 
+ if (!StringRef(LineInfo.FunctionName).endswith("()")) + OS << "()"; + OS << ":\n"; + } + if (LineInfo.FileName != DILineInfo::BadString && LineInfo.Line != 0 && + (OldLineInfo.Line != LineInfo.Line || + OldLineInfo.FileName != LineInfo.FileName || PrintFunctionName)) OS << Delimiter << LineInfo.FileName << ":" << LineInfo.Line << "\n"; - if (PrintSource) { - if (SourceCache.find(LineInfo.FileName) == SourceCache.end()) - if (!cacheSource(LineInfo)) - return; - auto LineBuffer = LineCache.find(LineInfo.FileName); - if (LineBuffer != LineCache.end()) { - if (LineInfo.Line > LineBuffer->second.size()) { - reportWarning( - formatv( - "debug info line number {0} exceeds the number of lines in {1}", - LineInfo.Line, LineInfo.FileName), - ObjectFilename); - return; - } - // Vector begins at 0, line numbers are non-zero - OS << Delimiter << LineBuffer->second[LineInfo.Line - 1] << '\n'; +} + +void SourcePrinter::printSources(raw_ostream &OS, const DILineInfo &LineInfo, + StringRef ObjectFilename, + StringRef Delimiter) { + if (LineInfo.FileName == DILineInfo::BadString || LineInfo.Line == 0 || + (OldLineInfo.Line == LineInfo.Line && + OldLineInfo.FileName == LineInfo.FileName)) + return; + + if (SourceCache.find(LineInfo.FileName) == SourceCache.end()) + if (!cacheSource(LineInfo)) + return; + auto LineBuffer = LineCache.find(LineInfo.FileName); + if (LineBuffer != LineCache.end()) { + if (LineInfo.Line > LineBuffer->second.size()) { + reportWarning( + formatv( + "debug info line number {0} exceeds the number of lines in {1}", + LineInfo.Line, LineInfo.FileName), + ObjectFilename); + return; } + // Vector begins at 0, line numbers are non-zero + OS << Delimiter << LineBuffer->second[LineInfo.Line - 1] << '\n'; } - OldLineInfo = LineInfo; } static bool isAArch64Elf(const ObjectFile *Obj) { diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp --- a/llvm/tools/obj2yaml/elf2yaml.cpp +++ b/llvm/tools/obj2yaml/elf2yaml.cpp @@ -502,7 +502,8 @@ S.Type = Shdr->sh_type; if (Shdr->sh_flags) S.Flags = static_cast(Shdr->sh_flags); - S.Address = Shdr->sh_addr; + if (Shdr->sh_addr) + S.Address = static_cast(Shdr->sh_addr); S.AddressAlign = Shdr->sh_addralign; if (Shdr->sh_entsize) S.EntSize = static_cast(Shdr->sh_entsize); diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp --- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp @@ -96,42 +96,12 @@ Unrecoverable = joinErrors(std::move(Unrecoverable), std::move(Err)); } - void checkError(ArrayRef ExpectedMsgs, Error Err) { - ASSERT_TRUE(Err.operator bool()); - size_t WhichMsg = 0; - Error Remaining = - handleErrors(std::move(Err), [&](const ErrorInfoBase &Actual) { - ASSERT_LT(WhichMsg, ExpectedMsgs.size()); - // Use .str(), because googletest doesn't visualise a StringRef - // properly. 
- EXPECT_EQ(Actual.message(), ExpectedMsgs[WhichMsg++].str()); - }); - EXPECT_EQ(WhichMsg, ExpectedMsgs.size()); - EXPECT_FALSE(Remaining); - } - - void checkError(StringRef ExpectedMsg, Error Err) { - checkError(ArrayRef{ExpectedMsg}, std::move(Err)); - } - - void checkGetOrParseLineTableEmitsFatalError(StringRef ExpectedMsg, - uint64_t Offset = 0) { - auto ExpectedLineTable = Line.getOrParseLineTable( - LineData, Offset, *Context, nullptr, RecordRecoverable); - EXPECT_FALSE(ExpectedLineTable); - EXPECT_FALSE(Recoverable); - - checkError(ExpectedMsg, ExpectedLineTable.takeError()); - } - - void checkGetOrParseLineTableEmitsFatalError(ArrayRef ExpectedMsgs, - uint64_t Offset = 0) { + Expected + getOrParseLineTableFatalErrors(uint64_t Offset = 0) { auto ExpectedLineTable = Line.getOrParseLineTable( LineData, Offset, *Context, nullptr, RecordRecoverable); - EXPECT_FALSE(ExpectedLineTable); - EXPECT_FALSE(Recoverable); - - checkError(ExpectedMsgs, ExpectedLineTable.takeError()); + EXPECT_THAT_ERROR(std::move(Recoverable), Succeeded()); + return ExpectedLineTable; } uint8_t AddressSize; @@ -213,14 +183,21 @@ return; generate(); - checkGetOrParseLineTableEmitsFatalError( - "offset 0x00000000 is not a valid debug line section offset", 0); + EXPECT_THAT_EXPECTED( + getOrParseLineTableFatalErrors(0), + FailedWithMessage( + "offset 0x00000000 is not a valid debug line section offset")); // Repeat to show that an error is reported each time. - checkGetOrParseLineTableEmitsFatalError( - "offset 0x00000000 is not a valid debug line section offset", 0); + EXPECT_THAT_EXPECTED( + getOrParseLineTableFatalErrors(0), + FailedWithMessage( + "offset 0x00000000 is not a valid debug line section offset")); + // Show that an error is reported for later offsets too. - checkGetOrParseLineTableEmitsFatalError( - "offset 0x00000001 is not a valid debug line section offset", 1); + EXPECT_THAT_EXPECTED( + getOrParseLineTableFatalErrors(1), + FailedWithMessage( + "offset 0x00000001 is not a valid debug line section offset")); } TEST_F(DebugLineBasicFixture, GetOrParseLineTableAtInvalidOffsetAfterData) { @@ -232,8 +209,10 @@ generate(); - checkGetOrParseLineTableEmitsFatalError( - "offset 0x00000001 is not a valid debug line section offset", 1); + EXPECT_THAT_EXPECTED( + getOrParseLineTableFatalErrors(1), + FailedWithMessage( + "offset 0x00000001 is not a valid debug line section offset")); } TEST_P(DebugLineParameterisedFixture, PrologueGetLength) { @@ -334,9 +313,11 @@ generate(); - checkGetOrParseLineTableEmitsFatalError( - "parsing line table prologue at offset 0x00000000 unsupported reserved " - "unit length found of value 0xfffffff0"); + EXPECT_THAT_EXPECTED( + getOrParseLineTableFatalErrors(), + FailedWithMessage( + "parsing line table prologue at offset 0x00000000 unsupported " + "reserved unit length found of value 0xfffffff0")); } struct DebugLineUnsupportedVersionFixture : public TestWithParam, @@ -356,10 +337,11 @@ generate(); - checkGetOrParseLineTableEmitsFatalError( - "parsing line table prologue at offset 0x00000000 found unsupported " - "version " + - std::to_string(Version)); + EXPECT_THAT_EXPECTED( + getOrParseLineTableFatalErrors(), + FailedWithMessage("parsing line table prologue at offset 0x00000000 " + "found unsupported version " + + std::to_string(Version))); } INSTANTIATE_TEST_CASE_P(UnsupportedVersionTestParams, @@ -399,11 +381,13 @@ nullptr, RecordRecoverable); EXPECT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); - checkError( - {"parsing line table prologue at 0x00000000 found an invalid 
directory " - "or file table description at 0x00000014", - "failed to parse entry content descriptions because no path was found"}, - std::move(Recoverable)); + EXPECT_THAT_ERROR( + std::move(Recoverable), + FailedWithMessage( + "parsing line table prologue at 0x00000000 found an invalid " + "directory or file table description at 0x00000014", + "failed to parse entry content descriptions because no path was " + "found")); } TEST_P(DebugLineParameterisedFixture, ErrorForTooLargePrologueLength) { @@ -431,13 +415,14 @@ uint64_t ExpectedEnd = Prologue.TotalLength + 1 + Prologue.sizeofTotalLength(); - checkError( - (Twine("parsing line table prologue at 0x00000000 should have ended at " - "0x000000") + - Twine::utohexstr(ExpectedEnd) + " but it ended at 0x000000" + - Twine::utohexstr(ExpectedEnd - 1)) - .str(), - std::move(Recoverable)); + EXPECT_THAT_ERROR( + std::move(Recoverable), + FailedWithMessage(("parsing line table prologue at 0x00000000 should " + "have ended at 0x000000" + + Twine::utohexstr(ExpectedEnd) + + " but it ended at 0x000000" + + Twine::utohexstr(ExpectedEnd - 1)) + .str())); } TEST_P(DebugLineParameterisedFixture, ErrorForTooShortPrologueLength) { @@ -486,8 +471,8 @@ Twine::utohexstr(ExpectedEnd) + " but it ended at 0x000000" + Twine::utohexstr(ActualEnd)) .str()); - std::vector ErrRefs(Errs.begin(), Errs.end()); - checkError(ErrRefs, std::move(Recoverable)); + EXPECT_THAT_ERROR(std::move(Recoverable), + FailedWithMessageArray(testing::ElementsAreArray(Errs))); } INSTANTIATE_TEST_CASE_P( @@ -517,9 +502,9 @@ auto ExpectedLineTable = Line.getOrParseLineTable(LineData, 0, *Context, nullptr, RecordRecoverable); - checkError( - "unexpected line op length at offset 0x00000031 expected 0x01 found 0x02", - std::move(Recoverable)); + EXPECT_THAT_ERROR(std::move(Recoverable), + FailedWithMessage("unexpected line op length at offset " + "0x00000031 expected 0x01 found 0x02")); ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 3u); EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 1u); @@ -546,9 +531,9 @@ auto ExpectedLineTable = Line.getOrParseLineTable(LineData, 0, *Context, nullptr, RecordRecoverable); - checkError( - "unexpected line op length at offset 0x00000032 expected 0x02 found 0x01", - std::move(Recoverable)); + EXPECT_THAT_ERROR(std::move(Recoverable), + FailedWithMessage("unexpected line op length at offset " + "0x00000032 expected 0x02 found 0x01")); ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 4u); EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 2u); @@ -576,9 +561,10 @@ auto ExpectedLineTable = Line.getOrParseLineTable(LineData, 1, *Context, nullptr, RecordRecoverable); - checkError("line table program with offset 0x00000001 has length 0x00000034 " - "but only 0x00000033 bytes are available", - std::move(Recoverable)); + EXPECT_THAT_ERROR( + std::move(Recoverable), + FailedWithMessage("line table program with offset 0x00000001 has length " + "0x00000034 but only 0x00000033 bytes are available")); ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); EXPECT_EQ((*ExpectedLineTable)->Rows.size(), 2u); EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 1u); @@ -603,9 +589,9 @@ auto ExpectedLineTable = Line.getOrParseLineTable(LineData, 0, *Context, nullptr, RecordRecoverable); - checkError( - "mismatching address size at offset 0x00000030 expected 0x08 found 0x04", - std::move(Recoverable)); + EXPECT_THAT_ERROR(std::move(Recoverable), + FailedWithMessage("mismatching 
address size at offset " + "0x00000030 expected 0x08 found 0x04")); ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 2u); EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 1u); @@ -630,9 +616,9 @@ auto ExpectedLineTable = Line.getOrParseLineTable(LineData, 0, *Context, nullptr, RecordRecoverable); - checkError( - "mismatching address size at offset 0x00000038 expected 0x04 found 0x08", - std::move(Recoverable)); + EXPECT_THAT_ERROR(std::move(Recoverable), + FailedWithMessage("mismatching address size at offset " + "0x00000038 expected 0x04 found 0x08")); ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 2u); EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 1u); @@ -662,10 +648,10 @@ auto ExpectedLineTable = Line.getOrParseLineTable(LineData, 0, *Context, nullptr, RecordRecoverable); - checkError( - "address size 0x03 of DW_LNE_set_address opcode at offset 0x00000030 is " - "unsupported", - std::move(Recoverable)); + EXPECT_THAT_ERROR( + std::move(Recoverable), + FailedWithMessage("address size 0x03 of DW_LNE_set_address opcode at " + "offset 0x00000030 is unsupported")); ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 3u); EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 1u); @@ -690,10 +676,10 @@ auto ExpectedLineTable = Line.getOrParseLineTable(LineData, 0, *Context, nullptr, RecordRecoverable); - checkError( - "address size 0x108 of DW_LNE_set_address opcode at offset 0x00000031 is " - "unsupported", - std::move(Recoverable)); + EXPECT_THAT_ERROR( + std::move(Recoverable), + FailedWithMessage("address size 0x108 of DW_LNE_set_address opcode at " + "offset 0x00000031 is unsupported")); ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); } @@ -723,10 +709,10 @@ auto ExpectedLineTable = Line.getOrParseLineTable(LineData, 0, *Context, nullptr, RecordRecoverable); - checkError( - "address size 0x09 of DW_LNE_set_address opcode at offset 0x00000038 is " - "unsupported", - std::move(Recoverable)); + EXPECT_THAT_ERROR( + std::move(Recoverable), + FailedWithMessage("address size 0x09 of DW_LNE_set_address opcode at " + "offset 0x00000038 is unsupported")); ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 3u); EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 1u); @@ -754,10 +740,10 @@ auto ExpectedLineTable = Line.getOrParseLineTable(LineData, 0, *Context, nullptr, RecordRecoverable); - checkError("last sequence in debug line table at offset 0x00000000 is not " - "terminated", - std::move(Recoverable)); - ASSERT_TRUE(ExpectedLineTable.operator bool()); + EXPECT_THAT_ERROR(std::move(Recoverable), + FailedWithMessage("last sequence in debug line table at " + "offset 0x00000000 is not terminated")); + ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); EXPECT_EQ((*ExpectedLineTable)->Rows.size(), 6u); // The unterminated sequence is not added to the sequence list. 
EXPECT_EQ((*ExpectedLineTable)->Sequences.size(), 1u); @@ -837,9 +823,11 @@ EXPECT_TRUE(Parser.done()); EXPECT_FALSE(Recoverable); - checkError("parsing line table prologue at offset 0x00000000 unsupported " - "reserved unit length found of value 0xfffffff0", - std::move(Unrecoverable)); + EXPECT_THAT_ERROR( + std::move(Unrecoverable), + FailedWithMessage( + "parsing line table prologue at offset 0x00000000 unsupported " + "reserved unit length found of value 0xfffffff0")); } TEST_F(DebugLineBasicFixture, ParserMovesToEndForBadLengthWhenSkipping) { @@ -858,9 +846,11 @@ EXPECT_TRUE(Parser.done()); EXPECT_FALSE(Recoverable); - checkError("parsing line table prologue at offset 0x00000000 unsupported " - "reserved unit length found of value 0xfffffff0", - std::move(Unrecoverable)); + EXPECT_THAT_ERROR( + std::move(Unrecoverable), + FailedWithMessage( + "parsing line table prologue at offset 0x00000000 unsupported " + "reserved unit length found of value 0xfffffff0")); } TEST_F(DebugLineBasicFixture, ParserReportsFirstErrorInEachTableWhenParsing) { @@ -879,13 +869,14 @@ Parser.parseNext(RecordRecoverable, RecordUnrecoverable); EXPECT_TRUE(Parser.done()); - EXPECT_FALSE(Recoverable); - - checkError({"parsing line table prologue at offset 0x00000000 found " - "unsupported version 0", - "parsing line table prologue at offset 0x00000006 found " - "unsupported version 1"}, - std::move(Unrecoverable)); + EXPECT_THAT_ERROR(std::move(Recoverable), Succeeded()); + + EXPECT_THAT_ERROR( + std::move(Unrecoverable), + FailedWithMessage("parsing line table prologue at offset 0x00000000 " + "found unsupported version 0", + "parsing line table prologue at offset 0x00000006 " + "found unsupported version 1")); } TEST_F(DebugLineBasicFixture, ParserReportsNonPrologueProblemsWhenParsing) { @@ -905,18 +896,18 @@ Parser.parseNext(RecordRecoverable, RecordUnrecoverable); EXPECT_FALSE(Unrecoverable); ASSERT_FALSE(Parser.done()); - checkError( - "unexpected line op length at offset 0x00000030 expected 0x42 found 0x01", - std::move(Recoverable)); + EXPECT_THAT_ERROR(std::move(Recoverable), + FailedWithMessage("unexpected line op length at offset " + "0x00000030 expected 0x42 found 0x01")); // Reset the error state so that it does not confuse the next set of checks. 
Unrecoverable = Error::success(); Parser.parseNext(RecordRecoverable, RecordUnrecoverable); EXPECT_TRUE(Parser.done()); - checkError("last sequence in debug line table at offset 0x00000031 is not " - "terminated", - std::move(Recoverable)); + EXPECT_THAT_ERROR(std::move(Recoverable), + FailedWithMessage("last sequence in debug line table at " + "offset 0x00000031 is not terminated")); EXPECT_FALSE(Unrecoverable); } @@ -939,11 +930,12 @@ EXPECT_TRUE(Parser.done()); EXPECT_FALSE(Recoverable); - checkError({"parsing line table prologue at offset 0x00000000 found " - "unsupported version 0", - "parsing line table prologue at offset 0x00000006 found " - "unsupported version 1"}, - std::move(Unrecoverable)); + EXPECT_THAT_ERROR( + std::move(Unrecoverable), + FailedWithMessage("parsing line table prologue at offset 0x00000000 " + "found unsupported version 0", + "parsing line table prologue at offset 0x00000006 " + "found unsupported version 1")); } TEST_F(DebugLineBasicFixture, ParserIgnoresNonPrologueErrorsWhenSkipping) { diff --git a/llvm/unittests/Transforms/Utils/KnowledgeRetentionTest.cpp b/llvm/unittests/Transforms/Utils/KnowledgeRetentionTest.cpp --- a/llvm/unittests/Transforms/Utils/KnowledgeRetentionTest.cpp +++ b/llvm/unittests/Transforms/Utils/KnowledgeRetentionTest.cpp @@ -51,8 +51,9 @@ #include "llvm/IR/Attributes.inc" }) { bool ShouldHaveAttr = Reg.match(Attr, &Matches) && Matches[0] == Attr; - if (ShouldHaveAttr != hasAttributeInAssume(*Assume, WasOn, Attr)) + if (ShouldHaveAttr != hasAttributeInAssume(*Assume, WasOn, Attr)) { ASSERT_TRUE(false); + } } } @@ -71,10 +72,12 @@ AssumeQuery::Lowest); bool ResultHigh = hasAttributeInAssume(*Assume, WasOn, Kind, &ArgValHigh, AssumeQuery::Highest); - if (ResultLow != ResultHigh) + if (ResultLow != ResultHigh || ResultHigh == false) { ASSERT_TRUE(false); - if (ArgValLow != Value || ArgValLow != ArgValHigh) - ASSERT_EQ(ArgValLow, Value); + } + if (ArgValLow != Value || ArgValLow != ArgValHigh) { + ASSERT_TRUE(false); + } } TEST(AssumeQueryAPI, Basic) { diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -396,8 +396,8 @@ << "namespace " << Name << "FU {\n"; for (unsigned j = 0, FUN = FUs.size(); j < FUN; ++j) - OS << " const unsigned " << FUs[j]->getName() << " = 1 << " << j << ";\n"; + OS << " const InstrStage::FuncUnits " << FUs[j]->getName() << " = 1ULL << " << j << ";\n"; OS << "} // end namespace " << Name << "FU\n"; diff --git a/mlir/docs/OpDefinitions.md b/mlir/docs/OpDefinitions.md --- a/mlir/docs/OpDefinitions.md +++ b/mlir/docs/OpDefinitions.md @@ -279,6 +279,24 @@ And similarly, `SameVariadicResultSize` for multiple variadic results in the same operation. +### Operation successors + +For terminator operations, the successors are specified inside of the +`dag`-typed `successors`, led by `successor`: + +```tablegen +let successors = (successor + <successor-constraint>:$<successor-name>, + ... +); +``` + +#### Variadic successors + +Similar to the `Variadic` class used for variadic operands and results, +`VariadicSuccessor<...>` can be used for successors. Variadic successors can +currently only be specified as the last successor in the successor list. + ### Operation traits and constraints Traits are operation properties that affect syntax or semantics. MLIR C++ @@ -583,25 +601,39 @@ A directive is a type of builtin function, with an optional set of arguments.
The available directives are as follows: -* `attr-dict` - - Represents the attribute dictionary of the operation. +* `attr-dict` + + - Represents the attribute dictionary of the operation. + +* `attr-dict-with-keyword` + + - Represents the attribute dictionary of the operation, but prefixes the + dictionary with an `attributes` keyword. -* `functional-type` ( inputs , results ) - - Formats the `inputs` and `results` arguments as a - [function type](LangRef.md#function-type). - - The constraints on `inputs` and `results` are the same as the `input` of - the `type` directive. +* `functional-type` ( inputs , results ) -* `operands` - - Represents all of the operands of an operation. + - Formats the `inputs` and `results` arguments as a + [function type](LangRef.md#function-type). + - The constraints on `inputs` and `results` are the same as the `input` of + the `type` directive. -* `results` - - Represents all of the results of an operation. +* `operands` -* `type` ( input ) - - Represents the type of the given input. - - `input` must be either an operand or result [variable](#variables), the - `operands` directive, or the `results` directive. + - Represents all of the operands of an operation. + +* `results` + + - Represents all of the results of an operation. + +* `successors` + + - Represents all of the successors of an operation. + +* `type` ( input ) + + - Represents the type of the given input. + - `input` must be either an operand or result [variable](#variables), the + `operands` directive, or the `results` directive. #### Literals @@ -613,12 +645,49 @@ #### Variables A variable is an entity that has been registered on the operation itself, i.e. -an argument(attribute or operand), result, etc. In the `CallOp` example above, -the variables would be `$callee` and `$args`. +an argument (attribute or operand), result, successor, etc. In the `CallOp` +example above, the variables would be `$callee` and `$args`. Attribute variables are printed with their respective value type, unless that value type is buildable. In those cases, the type of the attribute is elided. +#### Optional Groups + +In certain situations operations may have "optional" information, e.g. +attributes or an empty set of variadic operands. In these situations a section +of the assembly format can be marked as `optional` based on the presence of this +information. An optional group is defined by wrapping a set of elements within +`()` followed by a `?` and has the following requirements: + +* The first element of the group must either be a literal or an operand. + - This is because the first element must be optionally parsable. +* Exactly one argument variable within the group must be marked as the anchor + of the group. + - The anchor is the element whose presence controls whether the group + should be printed/parsed. + - An element is marked as the anchor by adding a trailing `^`. + - The first element is *not* required to be the anchor of the group. +* Literals, variables, and type directives are the only valid elements within + the group. + - Any attribute variable may be used, but only optional attributes can be + marked as the anchor. + - Only variadic, i.e. optional, operand arguments can be used. + - The operands to a type directive must be defined within the optional + group. + +An example of an operation with an optional group is `std.return`, which has a +variadic number of operands. + +``` +def ReturnOp : ...
{ + let arguments = (ins Variadic:$operands); + + // We only print the operands and types if there are a non-zero number + // of operands. + let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?"; +} +``` + #### Requirements The format specification has a certain set of requirements that must be adhered diff --git a/mlir/docs/Quantization.md b/mlir/docs/Quantization.md --- a/mlir/docs/Quantization.md +++ b/mlir/docs/Quantization.md @@ -18,7 +18,7 @@ The primary quantization mechanism supported by MLIR is a scheme which can express fixed point and affine transformations via uniformly spaced point on the -Real number line. +[Real](https://en.wikipedia.org/wiki/Real_number) number line. Further, the scheme can be applied: @@ -30,11 +30,11 @@ [Fixed point](https://en.wikipedia.org/wiki/Fixed-point_arithmetic) values are a [Real](https://en.wikipedia.org/wiki/Real_number) number divided by a *scale*. -We will call the result of the divided Real the *scaled value*. +We will call the result of the divided real the *scaled value*. $$ real\_value = scaled\_value * scale $$ -The scale can be interpreted as the distance, in Real units, between neighboring +The scale can be interpreted as the distance, in real units, between neighboring scaled values. For example, if the scale is $$ \pi $$, then fixed point values with this scale can only represent multiples of $$ \pi $$, and nothing in between. The maximum rounding error to convert an arbitrary Real to a fixed @@ -43,10 +43,10 @@ \frac{\pi}{2} $$. Multiplication can be performed on scaled values with different scales, using -the same algorithm as multiplication of Real values (note that product scaled +the same algorithm as multiplication of real values (note that product scaled value has $$ scale_{product} = scale_{left \mbox{ } operand} * scale_{right -\mbox{ } operand} $$). Addition can be performed on scaled values, as long as -they have the same scale, using the same algorithm as addition of Real values. +\mbox{ } operand} $$). Addition can be performed on scaled values, so long as +they have the same scale, using the same algorithm for addition of real values. This makes it convenient to represent scaled values on a computer as signed integers, and perform arithmetic on those signed integers, because the results will be correct scaled values. @@ -55,31 +55,31 @@ Mathematically speaking, affine values are the result of [adding a Real-valued *zero point*, to a scaled value](https://en.wikipedia.org/wiki/Affine_transformation#Representation). -Or equivalently, subtracting a zero point from an affine value results in a +Alternatively (and equivalently), subtracting a zero point from an affine value results in a scaled value: $$ real\_value = scaled\_value * scale = (affine\_value - zero\_point) * scale $$ -Essentially, affine values are a shifting of the scaled values by some constant +Essentially, affine values are a shift of the scaled values by some constant amount. Arithmetic (i.e., addition, subtraction, multiplication, division) -cannot, in general, be directly performed on affine values; you must first -[convert](#affine-to-fixed-point) them to the equivalent scaled values. +cannot, in general, be directly performed on affine values; they must first be +[converted](#affine-to-fixed-point) to the equivalent scaled values. As alluded to above, the motivation for using affine values is to more -efficiently represent the Real values that will actually be encountered during -computation. 
Frequently, the Real values that will be encountered are not -symmetric around the Real zero. We also make the assumption that the Real zero +efficiently represent real values that will actually be encountered during +computation. Frequently, real values that will be encountered are not +symmetric around the real zero. We also make the assumption that the real zero is encountered during computation, and should thus be represented. -In this case, it's inefficient to store scaled values represented by signed -integers, as some of the signed integers will never be used. The bit patterns +In this case, it is inefficient to store scaled values represented by signed +integers, as some of the signed integers will never be used. In effect, the bit patterns corresponding to those signed integers are going to waste. -In order to exactly represent the Real zero with an integral-valued affine +In order to exactly represent the real zero with an integral-valued affine value, the zero point must be an integer between the minimum and maximum affine value (inclusive). For example, given an affine value represented by an 8 bit unsigned integer, we have: $$ 0 \leq zero\_point \leq 255$$. This is important, -because in deep neural networks' convolution-like operations, we frequently +because in convolution-like operations of deep neural networks, we frequently need to zero-pad inputs and outputs, so zero must be exactly representable, or the result will be biased. @@ -99,14 +99,14 @@ rounding should be according to the IEEE754 default of RNE (where hardware permits). -### Converting between Real and fixed point or affine +### Converting between real and fixed point or affine -To convert a Real value to a fixed point value, you must know the scale. To -convert a Real value to an affine value, you must know the scale and zero point. +To convert a real value to a fixed point value, we must know the scale. To +convert a real value to an affine value, we must know the scale and the zero point. #### Real to affine -To convert an input tensor of Real-valued elements (usually represented by a +To convert an input tensor of real-valued elements (usually represented by a floating point format, frequently [Single precision](https://en.wikipedia.org/wiki/Single-precision_floating-point_format)) to a tensor of affine elements represented by an integral type (e.g. 8-bit @@ -121,16 +121,16 @@ $$ In the above, we assume that $$real\_value$$ is a Single, $$scale$$ is a Single, -$$roundToNearestInteger$$ returns a signed 32 bit integer, and $$zero\_point$$ -is an unsigned 8 or 16 bit integer. Note that bit depth and number of fixed +$$roundToNearestInteger$$ returns a signed 32-bit integer, and $$zero\_point$$ +is an unsigned 8-bit or 16-bit integer. Note that bit depth and number of fixed point values are indicative of common types on typical hardware but is not constrained to particular bit depths or a requirement that the entire range of an N-bit integer is used. -#### Affine to Real +#### Affine to real To convert an output tensor of affine elements represented by uint8 -or uint16 to a tensor of Real-valued elements (usually represented with a +or uint16 to a tensor of real-valued elements (usually represented with a floating point format, frequently Single precision), the following conversion can be performed: @@ -186,10 +186,10 @@ * The TFLite op-set natively supports uniform-quantized variants. * Passes and tools exist to convert directly from the *TensorFlow* dialect - to the TFLite quantized op-set. 
+ to the TFLite quantized operation set. + * [*FxpMath* dialect](#fxpmath-dialect) containing (experimental) generalized - representations of fixed-point math ops and conversions: + representations of fixed-point math operations and conversions: * [Real math ops](#real-math-ops) representing common combinations of arithmetic operations that closely match corresponding fixed-point math @@ -198,16 +198,16 @@ * [Fixed-point math ops](#fixed-point-math-ops) for carrying out computations on integers, as are typically needed by uniform quantization schemes. - * Passes to lower from real math ops to fixed-point math ops. + * Passes to lower from real math operations to fixed-point math operations. * [Solver tools](#solver-tools) which can (experimentally and generically) operate on computations expressed in the *FxpMath* dialect in order to convert from floating point types to appropriate *QuantizedTypes*, allowing - the computation to be further lowered to integral math ops. + the computation to be further lowered to integral math operations. -Not every application of quantization will use all facilities. Specifically, the +Not every application of quantization will use all of these facilities. Specifically, the TensorFlow to TensorFlow Lite conversion uses the QuantizedTypes but has its own -ops for type conversion and expression of the backing math. +operations for type conversion and expression of the supporting math. ## Quantization Dialect @@ -218,20 +218,20 @@ * QuantizedType base class * UniformQuantizedType -### Quantized type conversion ops +### Quantized type conversion operations * qcast : Convert from an expressed type to QuantizedType * dcast : Convert from a QuantizedType to its expressed type * scast : Convert between a QuantizedType and its storage type -### Instrumentation and constraint ops +### Instrumentation and constraint operations * const_fake_quant : Emulates the logic of the historic TensorFlow - fake_quant_with_min_max_args op. + fake_quant_with_min_max_args operation. * stats_ref : Declares that statistics should be gathered at this point with a unique key and made available to future passes of the solver. * stats : Declares inline statistics (per layer and per axis) for the point in - the computation. stats_ref ops are generally converted to stats ops once + the computation. stats_ref operations are generally converted to stats operations once trial runs have been performed. * coupled_ref : Declares points in the computation to be coupled from a type inference perspective based on a unique key. @@ -246,23 +246,23 @@ operations at inference time. When quantized inference was enabled, if every eligible tensor passed through an appropriate fake_quant node (the rules of which tensors can have fake_quant applied are somewhat involved), then -TensorFlow Lite would use the attributes of the fake_quant ops to make a -judgment about how to convert to use kernels from its quantized ops subset. +TensorFlow Lite would use the attributes of the fake_quant operations to make a +judgment about how to convert to use kernels from its quantized operations subset. -In MLIR-based quantization, fake_quant_\* ops are handled by converting them to +In MLIR-based quantization, fake_quant_\* operations are handled by converting them to a sequence of *qcast* (quantize) followed by *dcast* (dequantize) with an appropriate *UniformQuantizedType* as the target of the qcast operation.
This allows subsequent compiler passes to preserve the knowledge that -quantization was simulated in a certain way while giving the compiler +quantization was simulated in a certain way, while giving the compiler flexibility to move the casts as it simplifies the computation and converts it to a form based on integral arithmetic. This scheme also naturally allows computations that are *partially quantized* -where the parts which could not be reduced to integral ops are still carried out +where the parts which could not be reduced to integral operations are still carried out in floating point with appropriate conversions at the boundaries. -## TFLite Native Quantization +## TFLite native quantization TODO : Flesh this out @@ -280,16 +280,16 @@ -> tfl.Q) and replaces with (op). Also replace (constant_float -> tfl.Q) with (constant_quant). -## FxpMath Dialect +## FxpMath dialect -### Real math ops +### Real math operations Note that these all support explicit clamps, which allows for simple fusions and representation of some common sequences of quantization-compatible math. In addition, some support explicit biases, which are often represented as separate adds in source dialects. -TODO: This op set is still evolving and needs to be completed. +TODO: This operation set is still evolving and needs to be completed. * RealBinaryOp * RealAddEwOp @@ -312,9 +312,9 @@ * CMPLZ * CMPGZ -### Fixed-point math ops +### Fixed-point math operations -TODO: This op set only has enough ops to lower a simple power-of-two +TODO: This operation set only has enough operations to lower a simple power-of-two RealAddEwOp. * RoundingDivideByPotFxpOp @@ -331,7 +331,7 @@ precision types (i.e. bfloat16 or fp16). Solver tools are expected to operate in several modes, depending on the -computation and the manner in which it was trained: +computation and the training characteristics of the model: * *Transform* : With all available information in the MLIR computation, infer boundaries where the computation can be carried out with integral math and * For passthrough ops which do not perform active math, change them to operate directly on the storage type, converting in and out at the edges - via scast ops. - * For ops that have the *Quantizable* trait, the type can be set directly. - This includes ops from the [real math ops set]{#real-math-ops}. - * For others, encase them in appropriate dcast/qcast ops, presuming that + via scast operations. + * For operations that have the *Quantizable* trait, the type can be set directly. + This includes operations from the [real math ops set]{#real-math-ops}. + * For others, encase them in appropriate dcast/qcast operations, presuming that some follow-on pass will know what to do with them. * *Instrument* : Most of the time, there are not sufficient implied constraints within a computation to perform many transformations. For this - reason, the solver can insert instrumentation ops at points where additional + reason, the solver can insert instrumentation operations at points where additional runtime statistics may yield solutions. It is expected that such computations will be lowered as-is for execution, run over an appropriate - eval set, and statistics at each instrumentation point made available for a + evaluation set, and statistics at each instrumentation point made available for a future invocation of the solver.
* *Simplify* : A variety of passes and simplifications are applied once diff --git a/mlir/docs/ShapeInference.md b/mlir/docs/ShapeInference.md --- a/mlir/docs/ShapeInference.md +++ b/mlir/docs/ShapeInference.md @@ -10,7 +10,7 @@ valuable constraints that could be captured even without full knowledge of the shape. -Type inference is currently modelled executionally for op creation using the +Type inference is currently modelled executionally for operation creation using the [`InferTypeOpInterface`][InferTypeOpInterface], while `InferShapedTypeOpInterface` is used to implement the shape and element type inference. The return type can often be deduced from the deduced return shape @@ -27,7 +27,7 @@ * Constraints on the operands of an operation directly. For example constraining the input type to be tensor/vector elements or that the elemental type be of a specific type (e.g., output of computing the size - of a value is of elemental type `i1`) or class (e.g., float like). + of a value is of elemental type `i1`) or class (e.g., float-like). * Constraints across operands and results of an operation. - For example, specifying equality constraints on type/constituents of a @@ -41,7 +41,7 @@ ## Testing Shape inference is currently tested alongside type inference by -`TestReturnTypeDriver` in the test dialect. The driver performs two checks: +`TestReturnTypeDriver` in the test dialect. This driver performs two checks: 1. Verification that the return types specified match the inferred types. This explicit check will be removed and made part of Op verification instead. @@ -63,7 +63,7 @@ of the output shape). As shown in the shaped container type, shape will be one of 3 components, the others being elemental type and attribute (which is currently left open with the intention of supporting extensions such as layouts -or bounded shapes). This allows for decoupling of these: +or bounded shapes at a later point). This allows for decoupling of these: * Not all the information is needed for all analysis; * Not all shape functions need to provide all the information (e.g., one could @@ -73,16 +73,16 @@ representation of an operation; An argument could be made that these are metadata functions instead of shape -functions, with some considering shape and elemental type different and some as +functions, with some considering shape and elemental types different and some considering them both as part of shape. But `shape function` is IMHO descriptive and metadata can span too large a range of potential uses/values. ### Requirements -The requirements for the shape inference functions are shaped by the +The requirements for the shape inference functions are determined by the requirements of shape inference, but we believe the requirements below still -allow freedom to consider different shape inference approaches and so we don't -constrain to a particular shape inference approach here. +allow freedom to consider different shape inference approaches and so we do not +impose a particular shape inference approach here. #### Shape inference functions @@ -104,8 +104,8 @@ guaranteed to pass. * Ideally all would eventually (see section [Inlining shape checking](#inline)) be elided. - * Only report error guaranteed to occur at runtime, if an error is only - possible rather use runtime assertion to fail and produce an error + * Only reporting errors which are guaranteed to occur at runtime.
If an error is only + possible (rather than guaranteed) then we use a runtime assertion to fail and produce an error message with the invariant violated. * Shape functions usable by compiler and runtime. @@ -130,7 +130,7 @@ * Shape inference functions are expressible at runtime - * User can define a shape function for a new op dynamically at runtime, + * User can define a shape function for a new operation dynamically at runtime, this allows for vendors to describe an operation and shape function dynamically. @@ -140,10 +140,10 @@ information) * Shape functions should be cheap to invoke on each kernel launch. - * Shape function dictated by arguments (operands, attributes and regions) + * Shape function can be dictated by arguments (operands, attributes and regions) only (e.g., same operands as the corresponding operation could be constructed & invoked with). - * Shape information that need higher-level/graph information should use + * Shape information that needs higher-level/graph information should use richer types (e.g., `TensorList`); * The function should be invocable before/while constructing an op (e.g., can't rely on the op being constructed). @@ -157,19 +157,19 @@ determining the shape & then post to be able to actually consume the output of the file). -* The shape function op dialect should interop with non shape dialect ops. +* The shape function operation dialect should be interoperable with non-shape function dialect operations. - * There may be a common set of ops that satisfy most uses (e.g., merge, + * There may be a common set of operations that satisfy most uses (e.g., merge, equal_type, arithmetic expressions, slice, concat, pattern matching on attributes such as padding etc.) that will be discovered and could cover - a large percentage of the use cases. And among these there will be some + a large percentage of the use cases. Among these there will be some which carry extra semantic info that could be used for symbolic constraints (e.g., checking equality of two dimensions resulting in setting an equality constraint) and higher-order interpretation for constraint solving. - It is therefore beneficial to reuse operations but not required. - Especially as for statically known shapes, arbitrary arithmetic + It is therefore beneficial (but not required) to reuse operations, + especially as for statically known shapes, arbitrary arithmetic computations could still be performed. This means that the computations performed statically may or may not be supported by an arbitrary solver, but would still be allowed. @@ -239,7 +239,7 @@ ### Possibly Asked Questions -#### What about ODS specifications of ops? +#### What about ODS specifications of operations? In ODS we have been recording the constraints for the operands & attributes of an operation. Where these are sufficient to constrain the output shape (e.g., @@ -251,7 +251,7 @@ #### Why not extract the shape function from reference implementation? This could be done in future! The extracted shape function would use the shape -inference dialect, so we are starting there. Especially for ops described in a +inference dialect, so we are starting there. Especially for operations described in a structured way, one could autogenerate the shape function. #### How/in what language will the shape functions be authored? 
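To tie the `InferTypeOpInterface` discussion above to code: an operation opting into type inference implements a single static hook that deduces result types before the operation is built. A rough sketch for an op whose result type simply mirrors its first operand; the exact signature lives in `InferTypeOpInterface` and has varied across revisions, so treat this as an approximation rather than the interface's definitive form:

```c++
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/Support/LogicalResult.h"

using namespace mlir;

// Deduce result types from the would-be operands/attributes/regions, before
// the operation itself exists (so it cannot rely on the op being constructed).
static LogicalResult
inferReturnTypes(MLIRContext *context, Optional<Location> location,
                 ValueRange operands, ArrayRef<NamedAttribute> attributes,
                 RegionRange regions,
                 SmallVectorImpl<Type> &inferredReturnTypes) {
  if (operands.empty())
    return failure(); // Nothing to infer from.
  // SameOperandsAndResultType-style inference: mirror the first operand.
  inferredReturnTypes.push_back(operands.front().getType());
  return success();
}
```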
diff --git a/mlir/docs/Tutorials/Toy/Ch-2.md b/mlir/docs/Tutorials/Toy/Ch-2.md --- a/mlir/docs/Tutorials/Toy/Ch-2.md +++ b/mlir/docs/Tutorials/Toy/Ch-2.md @@ -517,12 +517,7 @@ } ``` -Above we introduce several of the concepts for defining operations in the ODS -framework, but there are many more that we haven't had a chance to: regions, -variadic operands, etc. Check out the -[full specification](../../OpDefinitions.md) for more details. - -## Complete Toy Example +#### Specifying a Custom Assembly Format At this point we can generate our "Toy IR". A simplified version of the previous example: @@ -565,6 +560,185 @@ } loc("test/codegen.toy":0:0) ``` +One thing to notice here is that all of our Toy operations are printed using the +generic assembly format. This format is the one shown when breaking down +`toy.transpose` at the beginning of this chapter. MLIR allows for operations to +define their own custom assembly format, either +[declaratively](../../OpDefinitions.md#declarative-assembly-format) or +imperatively via C++. Defining a custom assembly format allows for tailoring the +generated IR into something a bit more readable by removing a lot of the fluff +that is required by the generic format. Let's walk through an example of an +operation format that we would like to simplify. + +##### `toy.print` + +The current form of `toy.print` is a little verbose. There are a lot of +additional characters that we would like to strip away. Let's begin by thinking +of what a good format of `toy.print` would be, and see how we can implement it. +Looking at the basics of `toy.print` we get: + +```mlir +toy.print %5 : tensor<*xf64> loc(...) +``` + +Here we have stripped much of the format down to the bare essentials, and it has +become much more readable. To provide a custom assembly format, an operation can +either override the `parser` and `printer` fields for a C++ format, or the +`assemblyFormat` field for the declarative format. Let's look at the C++ variant +first, as this is what the declarative format maps to internally. + +```tablegen +/// Consider a stripped definition of `toy.print` here. +def PrintOp : Toy_Op<"print"> { + let arguments = (ins F64Tensor:$input); + + // Divert the printer and parser to static functions in our .cpp + // file that correspond to 'print' and 'parsePrintOp'. 'printer' and 'parser' + // here correspond to instances of 'OpAsmPrinter' and 'OpAsmParser' + // respectively. More details on these classes are shown below. + let printer = [{ return ::print(printer, *this); }]; + let parser = [{ return ::parse$cppClass(parser, result); }]; +} +``` + +A C++ implementation for the printer and parser is shown below: + +```c++ +/// The 'OpAsmPrinter' class is a stream that allows for formatting +/// strings, attributes, operands, types, etc. +static void print(mlir::OpAsmPrinter &printer, PrintOp op) { + printer << "toy.print " << op.input(); + printer.printOptionalAttrDict(op.getAttrs()); + printer << " : " << op.input().getType(); +} + +/// The 'OpAsmParser' class provides a collection of methods for parsing +/// various punctuation, as well as attributes, operands, types, etc. Each of +/// these methods returns a `ParseResult`. This class is a wrapper around +/// `LogicalResult` that can be converted to a boolean `true` value on failure, +/// or `false` on success. This allows for easily chaining together a set of +/// parser rules. These rules are used to populate an `mlir::OperationState` +/// similarly to the `build` methods described above. +static mlir::ParseResult parsePrintOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + // Parse the input operand, the attribute dictionary, and the type of the + // input. + mlir::OpAsmParser::OperandType inputOperand; + mlir::Type inputType; + if (parser.parseOperand(inputOperand) || + parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() || + parser.parseType(inputType)) + return mlir::failure(); + + // Resolve the input operand to the type we parsed in. + if (parser.resolveOperand(inputOperand, inputType, result.operands)) + return mlir::failure(); + + return mlir::success(); +} +``` + +With the C++ implementation defined, let's see how this can be mapped to the +[declarative format](../../OpDefinitions.md#declarative-assembly-format). The +declarative format is largely composed of three different components: + +* Directives + - A type of builtin function, with an optional set of arguments. +* Literals + - A keyword or punctuation surrounded by \`\`. +* Variables + - An entity that has been registered on the operation itself, i.e. an + argument (attribute or operand), result, successor, etc. In the `PrintOp` + example above, a variable would be `$input`. + +A direct mapping of our C++ format looks something like: + +```tablegen +/// Consider a stripped definition of `toy.print` here. +def PrintOp : Toy_Op<"print"> { + let arguments = (ins F64Tensor:$input); + + // In the following format we have two directives, `attr-dict` and `type`. + // These correspond to the attribute dictionary and the type of a given + // variable respectively. + let assemblyFormat = "$input attr-dict `:` type($input)"; +} +``` + +The [declarative format](../../OpDefinitions.md#declarative-assembly-format) has +many more interesting features, so be sure to check it out before implementing a +custom format in C++. After beautifying the format of a few of our operations we +now get a much more readable result: + +```mlir +module { + func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { + %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> loc("test/codegen.toy":5:10) + %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> loc("test/codegen.toy":5:25) + %2 = toy.mul %0, %1 : tensor<*xf64> loc("test/codegen.toy":5:25) + toy.return %2 : tensor<*xf64> loc("test/codegen.toy":5:3) + } loc("test/codegen.toy":4:1) + func @main() { + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> loc("test/codegen.toy":9:17) + %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> loc("test/codegen.toy":9:3) + %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> loc("test/codegen.toy":10:17) + %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> loc("test/codegen.toy":10:3) + %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> loc("test/codegen.toy":11:11) + %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> loc("test/codegen.toy":12:11) + toy.print %5 : tensor<*xf64> loc("test/codegen.toy":13:3) + toy.return loc("test/codegen.toy":8:1) + } loc("test/codegen.toy":8:1) +} loc("test/codegen.toy":0:0) +``` + +Above we introduce several of the concepts for defining operations in the ODS +framework, but there are many more that we haven't had a chance to cover: regions, +variadic operands, etc. Check out the +[full specification](../../OpDefinitions.md) for more details. + +## Complete Toy Example + +At this point we can generate our "Toy IR". A simplified version of the previous +example: + +```toy +# User-defined generic function that operates on unknown-shaped arguments. +def multiply_transpose(a, b) { + return transpose(a) * transpose(b); +} + +def main() { + var a<2, 3> = [[1, 2, 3], [4, 5, 6]]; + var b<2, 3> = [1, 2, 3, 4, 5, 6]; + var c = multiply_transpose(a, b); + var d = multiply_transpose(b, a); + print(d); +} +``` + +This results in the following IR: + +```mlir +module { + func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { + %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> loc("test/codegen.toy":5:10) + %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> loc("test/codegen.toy":5:25) + %2 = toy.mul %0, %1 : tensor<*xf64> loc("test/codegen.toy":5:25) + toy.return %2 : tensor<*xf64> loc("test/codegen.toy":5:3) + } loc("test/codegen.toy":4:1) + func @main() { + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> loc("test/codegen.toy":9:17) + %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> loc("test/codegen.toy":9:3) + %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> loc("test/codegen.toy":10:17) + %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> loc("test/codegen.toy":10:3) + %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> loc("test/codegen.toy":11:11) + %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> loc("test/codegen.toy":12:11) + toy.print %5 : tensor<*xf64> loc("test/codegen.toy":13:3) + toy.return loc("test/codegen.toy":8:1) + } loc("test/codegen.toy":8:1) +} loc("test/codegen.toy":0:0) +``` + You can build `toyc-ch2` and try it yourself: `toyc-ch2 test/Examples/Toy/Ch2/codegen.toy -emit=mlir -mlir-print-debuginfo`.
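An aside on the C++ parser shown earlier: the chained `||` style works because `ParseResult` (like `LogicalResult`) converts to `true` on failure, so the first rule that fails short-circuits the rest. A minimal, self-contained sketch of the idiom; the helper function itself is hypothetical, not part of this patch:

```c++
// Hypothetical helper illustrating ParseResult chaining: each parse* call
// consumes input on success, and any failure aborts the whole parse.
static mlir::ParseResult parseParenthesizedType(mlir::OpAsmParser &parser,
                                                mlir::Type &type) {
  if (parser.parseLParen() || parser.parseType(type) || parser.parseRParen())
    return mlir::failure();
  return mlir::success();
}
```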
We can also check the round-trip: `toyc-ch2 test/Examples/Toy/Ch2/codegen.toy -emit=mlir diff --git a/mlir/docs/Tutorials/Toy/Ch-3.md b/mlir/docs/Tutorials/Toy/Ch-3.md --- a/mlir/docs/Tutorials/Toy/Ch-3.md +++ b/mlir/docs/Tutorials/Toy/Ch-3.md @@ -38,9 +38,9 @@ ```mlir func @transpose_transpose(%arg0: tensor<*xf64>) -> tensor<*xf64> { - %0 = "toy.transpose"(%arg0) : (tensor<*xf64>) -> tensor<*xf64> - %1 = "toy.transpose"(%0) : (tensor<*xf64>) -> tensor<*xf64> - "toy.return"(%1) : (tensor<*xf64>) -> () + %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> + %1 = toy.transpose(%0 : tensor<*xf64>) to tensor<*xf64> + toy.return %1 : tensor<*xf64> } ``` @@ -133,8 +133,8 @@ ```mlir func @transpose_transpose(%arg0: tensor<*xf64>) -> tensor<*xf64> { - %0 = "toy.transpose"(%arg0) : (tensor<*xf64>) -> tensor<*xf64> - "toy.return"(%arg0) : (tensor<*xf64>) -> () + %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> + toy.return %arg0 : tensor<*xf64> } ``` @@ -154,7 +154,7 @@ ```mlir func @transpose_transpose(%arg0: tensor<*xf64>) -> tensor<*xf64> { - "toy.return"(%arg0) : (tensor<*xf64>) -> () + toy.return %arg0 : tensor<*xf64> } ``` @@ -229,13 +229,12 @@ ```mlir module { func @main() { - %0 = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf64>} - : () -> tensor<2xf64> - %1 = "toy.reshape"(%0) : (tensor<2xf64>) -> tensor<2x1xf64> - %2 = "toy.reshape"(%1) : (tensor<2x1xf64>) -> tensor<2x1xf64> - %3 = "toy.reshape"(%2) : (tensor<2x1xf64>) -> tensor<2x1xf64> - "toy.print"(%3) : (tensor<2x1xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf64> + %1 = toy.reshape(%0 : tensor<2xf64>) to tensor<2x1xf64> + %2 = toy.reshape(%1 : tensor<2x1xf64>) to tensor<2x1xf64> + %3 = toy.reshape(%2 : tensor<2x1xf64>) to tensor<2x1xf64> + toy.print %3 : tensor<2x1xf64> + toy.return } } ``` @@ -246,10 +245,9 @@ ```mlir module { func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00], [2.000000e+00]]> \ - : tensor<2x1xf64>} : () -> tensor<2x1xf64> - "toy.print"(%0) : (tensor<2x1xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00], [2.000000e+00]]> : tensor<2x1xf64> + toy.print %0 : tensor<2x1xf64> + toy.return } } ``` diff --git a/mlir/docs/Tutorials/Toy/Ch-4.md b/mlir/docs/Tutorials/Toy/Ch-4.md --- a/mlir/docs/Tutorials/Toy/Ch-4.md +++ b/mlir/docs/Tutorials/Toy/Ch-4.md @@ -150,20 +150,20 @@ ```mlir func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { - %0 = "toy.transpose"(%arg0) : (tensor<*xf64>) -> tensor<*xf64> - %1 = "toy.transpose"(%arg1) : (tensor<*xf64>) -> tensor<*xf64> - %2 = "toy.mul"(%0, %1) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> - "toy.return"(%2) : (tensor<*xf64>) -> () + %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> + %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> + %2 = toy.mul %0, %1 : tensor<*xf64> + toy.return %2 : tensor<*xf64> } func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %1 = "toy.reshape"(%0) : (tensor<2x3xf64>) -> tensor<2x3xf64> - %2 = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> - %3 = "toy.reshape"(%2) : (tensor<6xf64>) -> tensor<2x3xf64> - %4 = "toy.generic_call"(%1, %3) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>)
-> tensor<*xf64> - %5 = "toy.generic_call"(%3, %1) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> - "toy.print"(%5) : (tensor<*xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> + %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> + %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> + %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + toy.print %5 : tensor<*xf64> + toy.return } ``` @@ -226,8 +226,8 @@ %4 = "toy.transpose"(%2) : (tensor<*xf64>) -> tensor<*xf64> %5 = "toy.transpose"(%3) : (tensor<*xf64>) -> tensor<*xf64> %6 = "toy.mul"(%4, %5) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> - "toy.print"(%6) : (tensor<*xf64>) -> () - "toy.return"() : () -> () + toy.print %6 : tensor<*xf64> + toy.return } ``` @@ -374,8 +374,8 @@ %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> %1 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> %2 = "toy.mul"(%1, %1) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> - "toy.print"(%2) : (tensor<3x2xf64>) -> () - "toy.return"() : () -> () + toy.print %2 : tensor<3x2xf64> + toy.return } ``` diff --git a/mlir/docs/Tutorials/Toy/Ch-5.md b/mlir/docs/Tutorials/Toy/Ch-5.md --- a/mlir/docs/Tutorials/Toy/Ch-5.md +++ b/mlir/docs/Tutorials/Toy/Ch-5.md @@ -239,11 +239,11 @@ ```mlir func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %2 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> - %3 = "toy.mul"(%2, %2) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> - "toy.print"(%3) : (tensor<3x2xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> + %3 = toy.mul %2, %2 : tensor<3x2xf64> + toy.print %3 : tensor<3x2xf64> + toy.return } ``` @@ -291,7 +291,7 @@ } // Print the value held by the buffer. - "toy.print"(%0) : (memref<3x2xf64>) -> () + toy.print %0 : memref<3x2xf64> dealloc %2 : memref<2x3xf64> dealloc %1 : memref<3x2xf64> dealloc %0 : memref<3x2xf64> @@ -340,7 +340,7 @@ } // Print the value held by the buffer. 
- "toy.print"(%0) : (memref<3x2xf64>) -> () + toy.print %0 : memref<3x2xf64> dealloc %1 : memref<2x3xf64> dealloc %0 : memref<3x2xf64> return diff --git a/mlir/docs/Tutorials/Toy/Ch-6.md b/mlir/docs/Tutorials/Toy/Ch-6.md --- a/mlir/docs/Tutorials/Toy/Ch-6.md +++ b/mlir/docs/Tutorials/Toy/Ch-6.md @@ -115,11 +115,11 @@ ```mlir func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %2 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> - %3 = "toy.mul"(%2, %2) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> - "toy.print"(%3) : (tensor<3x2xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> + %3 = toy.mul %2, %2 : tensor<3x2xf64> + toy.print %3 : tensor<3x2xf64> + toy.return } ``` diff --git a/mlir/docs/Tutorials/Toy/Ch-7.md b/mlir/docs/Tutorials/Toy/Ch-7.md --- a/mlir/docs/Tutorials/Toy/Ch-7.md +++ b/mlir/docs/Tutorials/Toy/Ch-7.md @@ -342,7 +342,7 @@ ```mlir module { func @multiply_transpose(%arg0: !toy.struct, tensor<*xf64>>) { - "toy.return"() : () -> () + toy.return } } ``` @@ -391,9 +391,9 @@ that contains a set of constant values for each of the `struct` elements. ```mlir - %0 = "toy.struct_constant"() { - value = [dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64>] - } : () -> !toy.struct> + %0 = toy.struct_constant [ + dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> + ] : !toy.struct> ``` ##### `toy.struct_access` @@ -401,10 +401,10 @@ This new operation materializes the Nth element of a `struct` value. 
```mlir - %0 = "toy.struct_constant"() { - value = [dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64>] - } : () -> !toy.struct> - %1 = "toy.struct_access"(%0) {index = 0 : i64} : (!toy.struct>) -> tensor<*xf64> + %0 = toy.struct_constant [ + dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> + ] : !toy.struct> + %1 = toy.struct_access %0[0] : !toy.struct> -> tensor<*xf64> ``` With these operations, we can revisit our original example: @@ -436,18 +436,21 @@ ```mlir module { func @multiply_transpose(%arg0: !toy.struct, tensor<*xf64>>) -> tensor<*xf64> { - %0 = "toy.struct_access"(%arg0) {index = 0 : i64} : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> - %1 = "toy.transpose"(%0) : (tensor<*xf64>) -> tensor<*xf64> - %2 = "toy.struct_access"(%arg0) {index = 1 : i64} : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> - %3 = "toy.transpose"(%2) : (tensor<*xf64>) -> tensor<*xf64> - %4 = "toy.mul"(%1, %3) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> - "toy.return"(%4) : (tensor<*xf64>) -> () + %0 = toy.struct_access %arg0[0] : !toy.struct, tensor<*xf64>> -> tensor<*xf64> + %1 = toy.transpose(%0 : tensor<*xf64>) to tensor<*xf64> + %2 = toy.struct_access %arg0[1] : !toy.struct, tensor<*xf64>> -> tensor<*xf64> + %3 = toy.transpose(%2 : tensor<*xf64>) to tensor<*xf64> + %4 = toy.mul %1, %3 : tensor<*xf64> + toy.return %4 : tensor<*xf64> } func @main() { - %0 = "toy.struct_constant"() {value = [dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>, dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>]} : () -> !toy.struct, tensor<*xf64>> - %1 = "toy.generic_call"(%0) {callee = @multiply_transpose} : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> - "toy.print"(%1) : (tensor<*xf64>) -> () - "toy.return"() : () -> () + %0 = toy.struct_constant [ + dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>, + dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + ] : !toy.struct, tensor<*xf64>> + %1 = toy.generic_call @multiply_transpose(%0) : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> + toy.print %1 : tensor<*xf64> + toy.return } } ``` @@ -462,14 +465,17 @@ ```mlir module { func @main() { - %0 = "toy.struct_constant"() {value = [dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>, dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>]} : () -> !toy.struct, tensor<*xf64>> - %1 = "toy.struct_access"(%0) {index = 0 : i64} : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> - %2 = "toy.transpose"(%1) : (tensor<*xf64>) -> tensor<*xf64> - %3 = "toy.struct_access"(%0) {index = 1 : i64} : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> - %4 = "toy.transpose"(%3) : (tensor<*xf64>) -> tensor<*xf64> - %5 = "toy.mul"(%2, %4) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> - "toy.print"(%5) : (tensor<*xf64>) -> () - "toy.return"() : () -> () + %0 = toy.struct_constant [ + dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>, + dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + ] : !toy.struct, tensor<*xf64>> + %1 = toy.struct_access %0[0] : !toy.struct, tensor<*xf64>> -> tensor<*xf64> + %2 = toy.transpose(%1 : 
tensor<*xf64>) to tensor<*xf64> + %3 = toy.struct_access %0[1] : !toy.struct, tensor<*xf64>> -> tensor<*xf64> + %4 = toy.transpose(%3 : tensor<*xf64>) to tensor<*xf64> + %5 = toy.mul %2, %4 : tensor<*xf64> + toy.print %5 : tensor<*xf64> + toy.return } } ``` @@ -524,11 +530,11 @@ ```mlir module { func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %1 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> - %2 = "toy.mul"(%1, %1) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> - "toy.print"(%2) : (tensor<3x2xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %1 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> + %2 = toy.mul %1, %1 : tensor<3x2xf64> + toy.print %2 : tensor<3x2xf64> + toy.return } } ``` diff --git a/mlir/examples/toy/Ch2/include/toy/Ops.td b/mlir/examples/toy/Ch2/include/toy/Ops.td --- a/mlir/examples/toy/Ch2/include/toy/Ops.td +++ b/mlir/examples/toy/Ch2/include/toy/Ops.td @@ -47,9 +47,8 @@ to the operation as an attribute. For example: ```mlir - %0 = "toy.constant"() - { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } - : () -> tensor<2x3xf64> + %0 = toy.constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> + : tensor<2x3xf64> ``` }]; @@ -59,6 +58,10 @@ // The constant operation returns a single value of TensorType. let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseConstantOp(parser, result); }]; + let printer = [{ return ::print(p, *this); }]; + // Add custom build methods for the constant operation. These method populates // the `state` that MLIR uses to create operations, i.e. these are used when // using `builder.create(...)`. @@ -87,6 +90,10 @@ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseBinaryOp(parser, result); }]; + let printer = [{ return ::printBinaryOp(p, *this); }]; + // Allow building an AddOp with from the two input operands. let builders = [ OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> @@ -102,7 +109,7 @@ arguments expected by the callee. For example: ```mlir - %4 = "toy.generic_call"(%1, %3) {callee = @my_func} + %4 = toy.generic_call @my_func(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> ``` @@ -117,6 +124,11 @@ // The generic call operation returns a single value of TensorType. let results = (outs F64Tensor); + // The return operation only emits the input in the format if it is present. + let assemblyFormat = [{ + $callee `(` $inputs `)` attr-dict `:` functional-type($inputs, results) + }]; + // Add custom build methods for the generic call operation. let builders = [ OpBuilder<"Builder *builder, OperationState &state, " @@ -134,6 +146,10 @@ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseBinaryOp(parser, result); }]; + let printer = [{ return ::printBinaryOp(p, *this); }]; + // Allow building a MulOp with from the two input operands. let builders = [ OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> @@ -149,6 +165,8 @@ // The print operation takes an input tensor to print. 
let arguments = (ins F64Tensor:$input); + + let assemblyFormat = "$input attr-dict `:` type($input)"; } def ReshapeOp : Toy_Op<"reshape"> { @@ -158,7 +176,7 @@ the same number of elements but different shapes. For example: ```mlir - %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64> + %0 = toy.reshape (%arg1 : tensor<10xf64>) to tensor<5x2xf64> ``` }]; @@ -166,6 +184,10 @@ // We expect that the reshape operation returns a statically shaped tensor. let results = (outs StaticShapeTensorOf<[F64]>); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; } def ReturnOp : Toy_Op<"return", [Terminator, HasParent<"FuncOp">]> { @@ -188,6 +210,9 @@ // value must match the return type of the enclosing function. let arguments = (ins Variadic<F64Tensor>:$input); + // The return operation only emits the input in the format if it is present. + let assemblyFormat = "($input^ `:` type($input))? attr-dict "; + // Allow building a ReturnOp with no return operand. let builders = [OpBuilder< "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }] @@ -208,6 +233,10 @@ let arguments = (ins F64Tensor:$input); let results = (outs F64Tensor); + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + // Allow building a TransposeOp from the input operand. let builders = [ OpBuilder<"Builder *b, OperationState &state, Value input"> diff --git a/mlir/examples/toy/Ch2/mlir/Dialect.cpp b/mlir/examples/toy/Ch2/mlir/Dialect.cpp --- a/mlir/examples/toy/Ch2/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch2/mlir/Dialect.cpp @@ -14,6 +14,7 @@ #include "toy/Dialect.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/OpImplementation.h" #include "mlir/IR/StandardTypes.h" using namespace mlir; @@ -36,6 +37,54 @@ // Toy Operations //===----------------------------------------------------------------------===// +/// A generalized parser for binary operations. This parses the different forms +/// of 'printBinaryOp' below. +static mlir::ParseResult parseBinaryOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + SmallVector<mlir::OpAsmParser::OperandType, 2> operands; + llvm::SMLoc operandsLoc = parser.getCurrentLocation(); + Type type; + if (parser.parseOperandList(operands, /*requiredOperandCount=*/2) || + parser.parseOptionalAttrDict(result.attributes) || + parser.parseColonType(type)) + return mlir::failure(); + + // If the type is a function type, it contains the input and result types of + // this operation. + if (FunctionType funcType = type.dyn_cast<FunctionType>()) { + if (parser.resolveOperands(operands, funcType.getInputs(), operandsLoc, + result.operands)) + return mlir::failure(); + result.addTypes(funcType.getResults()); + return mlir::success(); + } + + // Otherwise, the parsed type is the type of both operands and results. + if (parser.resolveOperands(operands, type, result.operands)) + return mlir::failure(); + result.addTypes(type); + return mlir::success(); +} + +/// A generalized printer for binary operations. It prints in two different +/// forms depending on whether all of the types match. +static void printBinaryOp(mlir::OpAsmPrinter &printer, mlir::Operation *op) { + printer << op->getName() << " " << op->getOperands(); + printer.printOptionalAttrDict(op->getAttrs()); + printer << " : "; + + // If all of the types are the same, print the type directly. + Type resultType = *op->result_type_begin(); + if (llvm::all_of(op->getOperandTypes(), + [=](Type type) { return type == resultType; })) { + printer << resultType; + return; + } + + // Otherwise, print a functional type. + printer.printFunctionalType(op->getOperandTypes(), op->getResultTypes()); +} + //===----------------------------------------------------------------------===// // ConstantOp @@ -49,6 +98,32 @@ ConstantOp::build(builder, state, dataType, dataAttribute); } +/// The 'OpAsmParser' class provides a collection of methods for parsing +/// various punctuation, as well as attributes, operands, types, etc. Each of +/// these methods returns a `ParseResult`. This class is a wrapper around +/// `LogicalResult` that can be converted to a boolean `true` value on failure, +/// or `false` on success. This allows for easily chaining together a set of +/// parser rules. These rules are used to populate an `mlir::OperationState` +/// similarly to the `build` methods described above. +static mlir::ParseResult parseConstantOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + mlir::DenseElementsAttr value; + if (parser.parseOptionalAttrDict(result.attributes) || + parser.parseAttribute(value, "value", result.attributes)) + return failure(); + + result.addTypes(value.getType()); + return success(); +} + +/// The 'OpAsmPrinter' class is a stream that allows for formatting +/// strings, attributes, operands, types, etc. +static void print(mlir::OpAsmPrinter &printer, ConstantOp op) { + printer << "toy.constant "; + printer.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"value"}); + printer << op.value(); +} + /// Verifier for the constant operation. This corresponds to the `::verify(...)` /// in the op definition. static mlir::LogicalResult verify(ConstantOp op) { diff --git a/mlir/examples/toy/Ch3/include/toy/Ops.td b/mlir/examples/toy/Ch3/include/toy/Ops.td --- a/mlir/examples/toy/Ch3/include/toy/Ops.td +++ b/mlir/examples/toy/Ch3/include/toy/Ops.td @@ -47,9 +47,8 @@ to the operation as an attribute. For example: ```mlir - %0 = "toy.constant"() - { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } - : () -> tensor<2x3xf64> + %0 = toy.constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> + : tensor<2x3xf64> ``` }]; @@ -59,6 +58,10 @@ // The constant operation returns a single value of TensorType. let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseConstantOp(parser, result); }]; + let printer = [{ return ::print(p, *this); }]; + // Add custom build methods for the constant operation. These methods populate // the `state` that MLIR uses to create operations, i.e. these are used when // using `builder.create<ConstantOp>(...)`. @@ -87,6 +90,10 @@ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseBinaryOp(parser, result); }]; + let printer = [{ return ::printBinaryOp(p, *this); }]; + // Allow building an AddOp from the two input operands. let builders = [ OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> @@ -102,7 +109,7 @@ arguments expected by the callee. For example: ```mlir - %4 = "toy.generic_call"(%1, %3) {callee = @my_func} + %4 = toy.generic_call @my_func(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> ``` @@ -117,6 +124,11 @@ // The generic call operation returns a single value of TensorType.
let results = (outs F64Tensor); + // Specify the assembly format of the generic call operation. + let assemblyFormat = [{ + $callee `(` $inputs `)` attr-dict `:` functional-type($inputs, results) + }]; + // Add custom build methods for the generic call operation. let builders = [ OpBuilder<"Builder *builder, OperationState &state, " @@ -134,6 +146,10 @@ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseBinaryOp(parser, result); }]; + let printer = [{ return ::printBinaryOp(p, *this); }]; + // Allow building a MulOp from the two input operands. let builders = [ OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> @@ -149,6 +165,8 @@ // The print operation takes an input tensor to print. let arguments = (ins F64Tensor:$input); + + let assemblyFormat = "$input attr-dict `:` type($input)"; } def ReshapeOp : Toy_Op<"reshape", [NoSideEffect]> { @@ -158,17 +176,21 @@ the same number of elements but different shapes. For example: ```mlir - %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64> + %0 = toy.reshape (%arg1 : tensor<10xf64>) to tensor<5x2xf64> ``` }]; let arguments = (ins F64Tensor:$input); - // Enabled registering canonicalization patterns with this operation. - let hasCanonicalizer = 1; - // We expect that the reshape operation returns a statically shaped tensor. let results = (outs StaticShapeTensorOf<[F64]>); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. + let hasCanonicalizer = 1; } def ReturnOp : Toy_Op<"return", [Terminator, HasParent<"FuncOp">]> { @@ -191,6 +213,9 @@ // value must match the return type of the enclosing function. let arguments = (ins Variadic<F64Tensor>:$input); + // The return operation only emits the input in the format if it is present. + let assemblyFormat = "($input^ `:` type($input))? attr-dict "; + // Allow building a ReturnOp with no return operand. let builders = [OpBuilder< "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }] @@ -211,7 +236,11 @@ let arguments = (ins F64Tensor:$input); let results = (outs F64Tensor); - // Enabled registering canonicalization patterns with this operation. + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. let hasCanonicalizer = 1; // Allow building a TransposeOp from the input operand. diff --git a/mlir/examples/toy/Ch3/mlir/Dialect.cpp b/mlir/examples/toy/Ch3/mlir/Dialect.cpp --- a/mlir/examples/toy/Ch3/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch3/mlir/Dialect.cpp @@ -14,6 +14,7 @@ #include "toy/Dialect.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/OpImplementation.h" #include "mlir/IR/StandardTypes.h" using namespace mlir; @@ -36,6 +37,54 @@ // Toy Operations //===----------------------------------------------------------------------===// +/// A generalized parser for binary operations. This parses the different forms +/// of 'printBinaryOp' below. +static mlir::ParseResult parseBinaryOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + SmallVector<mlir::OpAsmParser::OperandType, 2> operands; + llvm::SMLoc operandsLoc = parser.getCurrentLocation(); + Type type; + if (parser.parseOperandList(operands, /*requiredOperandCount=*/2) || + parser.parseOptionalAttrDict(result.attributes) || + parser.parseColonType(type)) + return mlir::failure(); + + // If the type is a function type, it contains the input and result types of + // this operation. + if (FunctionType funcType = type.dyn_cast<FunctionType>()) { + if (parser.resolveOperands(operands, funcType.getInputs(), operandsLoc, + result.operands)) + return mlir::failure(); + result.addTypes(funcType.getResults()); + return mlir::success(); + } + + // Otherwise, the parsed type is the type of both operands and results. + if (parser.resolveOperands(operands, type, result.operands)) + return mlir::failure(); + result.addTypes(type); + return mlir::success(); +} + +/// A generalized printer for binary operations. It prints in two different +/// forms depending on whether all of the types match. +static void printBinaryOp(mlir::OpAsmPrinter &printer, mlir::Operation *op) { + printer << op->getName() << " " << op->getOperands(); + printer.printOptionalAttrDict(op->getAttrs()); + printer << " : "; + + // If all of the types are the same, print the type directly. + Type resultType = *op->result_type_begin(); + if (llvm::all_of(op->getOperandTypes(), + [=](Type type) { return type == resultType; })) { + printer << resultType; + return; + } + + // Otherwise, print a functional type. + printer.printFunctionalType(op->getOperandTypes(), op->getResultTypes()); +} + //===----------------------------------------------------------------------===// // ConstantOp @@ -49,6 +98,32 @@ ConstantOp::build(builder, state, dataType, dataAttribute); } +/// The 'OpAsmParser' class provides a collection of methods for parsing +/// various punctuation, as well as attributes, operands, types, etc. Each of +/// these methods returns a `ParseResult`. This class is a wrapper around +/// `LogicalResult` that can be converted to a boolean `true` value on failure, +/// or `false` on success. This allows for easily chaining together a set of +/// parser rules. These rules are used to populate an `mlir::OperationState` +/// similarly to the `build` methods described above. +static mlir::ParseResult parseConstantOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + mlir::DenseElementsAttr value; + if (parser.parseOptionalAttrDict(result.attributes) || + parser.parseAttribute(value, "value", result.attributes)) + return failure(); + + result.addTypes(value.getType()); + return success(); +} + +/// The 'OpAsmPrinter' class is a stream that allows for formatting +/// strings, attributes, operands, types, etc. +static void print(mlir::OpAsmPrinter &printer, ConstantOp op) { + printer << "toy.constant "; + printer.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"value"}); + printer << op.value(); +} + /// Verifier for the constant operation. This corresponds to the `::verify(...)` /// in the op definition. static mlir::LogicalResult verify(ConstantOp op) { diff --git a/mlir/examples/toy/Ch4/include/toy/Ops.td b/mlir/examples/toy/Ch4/include/toy/Ops.td --- a/mlir/examples/toy/Ch4/include/toy/Ops.td +++ b/mlir/examples/toy/Ch4/include/toy/Ops.td @@ -48,9 +48,8 @@ to the operation as an attribute.
For example: ```mlir - %0 = "toy.constant"() - { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } - : () -> tensor<2x3xf64> + %0 = toy.constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> + : tensor<2x3xf64> ``` }]; @@ -60,6 +59,10 @@ // The constant operation returns a single value of TensorType. let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseConstantOp(parser, result); }]; + let printer = [{ return ::print(p, *this); }]; + // Add custom build methods for the constant operation. These methods populate // the `state` that MLIR uses to create operations, i.e. these are used when // using `builder.create<ConstantOp>(...)`. @@ -89,6 +92,10 @@ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseBinaryOp(parser, result); }]; + let printer = [{ return ::printBinaryOp(p, *this); }]; + // Allow building an AddOp from the two input operands. let builders = [ OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> @@ -110,6 +117,8 @@ let arguments = (ins F64Tensor:$input); let results = (outs F64Tensor:$output); + let assemblyFormat = "$input attr-dict `:` type($input) `to` type($output)"; + // Set the folder bit so that we can fold redundant cast operations. let hasFolder = 1; } @@ -124,7 +133,7 @@ arguments expected by the callee. For example: ```mlir - %4 = "toy.generic_call"(%1, %3) {callee = @my_func} + %4 = toy.generic_call @my_func(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> ``` @@ -139,6 +148,11 @@ // The generic call operation returns a single value of TensorType. let results = (outs F64Tensor); + // Specify the assembly format of the generic call operation. + let assemblyFormat = [{ + $callee `(` $inputs `)` attr-dict `:` functional-type($inputs, results) + }]; + // Add custom build methods for the generic call operation. let builders = [ OpBuilder<"Builder *builder, OperationState &state, " @@ -157,6 +171,10 @@ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseBinaryOp(parser, result); }]; + let printer = [{ return ::printBinaryOp(p, *this); }]; + // Allow building a MulOp from the two input operands. let builders = [ OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> @@ -172,6 +190,8 @@ // The print operation takes an input tensor to print. let arguments = (ins F64Tensor:$input); + + let assemblyFormat = "$input attr-dict `:` type($input)"; } def ReshapeOp : Toy_Op<"reshape", [NoSideEffect]> { @@ -181,15 +201,21 @@ the same number of elements but different shapes. For example: ```mlir - %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64> + %0 = toy.reshape (%arg1 : tensor<10xf64>) to tensor<5x2xf64> ``` }]; let arguments = (ins F64Tensor:$input); - let hasCanonicalizer = 1; // We expect that the reshape operation returns a statically shaped tensor. let results = (outs StaticShapeTensorOf<[F64]>); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. + let hasCanonicalizer = 1; } def ReturnOp : Toy_Op<"return", [Terminator, HasParent<"FuncOp">]> { @@ -212,6 +238,9 @@ // value must match the return type of the enclosing function. let arguments = (ins Variadic<F64Tensor>:$input); + // The return operation only emits the input in the format if it is present. + let assemblyFormat = "($input^ `:` type($input))? attr-dict "; + // Allow building a ReturnOp with no return operand. let builders = [OpBuilder< "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }] @@ -232,6 +261,12 @@ let arguments = (ins F64Tensor:$input); let results = (outs F64Tensor); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. let hasCanonicalizer = 1; // Allow building a TransposeOp from the input operand. diff --git a/mlir/examples/toy/Ch4/mlir/Dialect.cpp b/mlir/examples/toy/Ch4/mlir/Dialect.cpp --- a/mlir/examples/toy/Ch4/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch4/mlir/Dialect.cpp @@ -14,6 +14,7 @@ #include "toy/Dialect.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/OpImplementation.h" #include "mlir/IR/StandardTypes.h" #include "mlir/Transforms/InliningUtils.h" @@ -86,6 +87,54 @@ // Toy Operations //===----------------------------------------------------------------------===// +/// A generalized parser for binary operations. This parses the different forms +/// of 'printBinaryOp' below. +static mlir::ParseResult parseBinaryOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + SmallVector<mlir::OpAsmParser::OperandType, 2> operands; + llvm::SMLoc operandsLoc = parser.getCurrentLocation(); + Type type; + if (parser.parseOperandList(operands, /*requiredOperandCount=*/2) || + parser.parseOptionalAttrDict(result.attributes) || + parser.parseColonType(type)) + return mlir::failure(); + + // If the type is a function type, it contains the input and result types of + // this operation. + if (FunctionType funcType = type.dyn_cast<FunctionType>()) { + if (parser.resolveOperands(operands, funcType.getInputs(), operandsLoc, + result.operands)) + return mlir::failure(); + result.addTypes(funcType.getResults()); + return mlir::success(); + } + + // Otherwise, the parsed type is the type of both operands and results. + if (parser.resolveOperands(operands, type, result.operands)) + return mlir::failure(); + result.addTypes(type); + return mlir::success(); +} + +/// A generalized printer for binary operations. It prints in two different +/// forms depending on whether all of the types match. +static void printBinaryOp(mlir::OpAsmPrinter &printer, mlir::Operation *op) { + printer << op->getName() << " " << op->getOperands(); + printer.printOptionalAttrDict(op->getAttrs()); + printer << " : "; + + // If all of the types are the same, print the type directly. + Type resultType = *op->result_type_begin(); + if (llvm::all_of(op->getOperandTypes(), + [=](Type type) { return type == resultType; })) { + printer << resultType; + return; + } + + // Otherwise, print a functional type. + printer.printFunctionalType(op->getOperandTypes(), op->getResultTypes()); +} + //===----------------------------------------------------------------------===// // ConstantOp @@ -99,6 +148,32 @@ ConstantOp::build(builder, state, dataType, dataAttribute); } +/// The 'OpAsmParser' class provides a collection of methods for parsing +/// various punctuation, as well as attributes, operands, types, etc. Each of +/// these methods returns a `ParseResult`. This class is a wrapper around +/// `LogicalResult` that can be converted to a boolean `true` value on failure, +/// or `false` on success. This allows for easily chaining together a set of +/// parser rules. These rules are used to populate an `mlir::OperationState` +/// similarly to the `build` methods described above. +static mlir::ParseResult parseConstantOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + mlir::DenseElementsAttr value; + if (parser.parseOptionalAttrDict(result.attributes) || + parser.parseAttribute(value, "value", result.attributes)) + return failure(); + + result.addTypes(value.getType()); + return success(); +} + +/// The 'OpAsmPrinter' class is a stream that allows for formatting +/// strings, attributes, operands, types, etc. +static void print(mlir::OpAsmPrinter &printer, ConstantOp op) { + printer << "toy.constant "; + printer.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"value"}); + printer << op.value(); +} + /// Verifier for the constant operation. This corresponds to the `::verify(...)` /// in the op definition. static mlir::LogicalResult verify(ConstantOp op) { diff --git a/mlir/examples/toy/Ch5/include/toy/Ops.td b/mlir/examples/toy/Ch5/include/toy/Ops.td --- a/mlir/examples/toy/Ch5/include/toy/Ops.td +++ b/mlir/examples/toy/Ch5/include/toy/Ops.td @@ -48,9 +48,8 @@ to the operation as an attribute. For example: ```mlir - %0 = "toy.constant"() - { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } - : () -> tensor<2x3xf64> + %0 = toy.constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> + : tensor<2x3xf64> ``` }]; @@ -60,6 +59,10 @@ // The constant operation returns a single value of TensorType. let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseConstantOp(parser, result); }]; + let printer = [{ return ::print(p, *this); }]; + // Add custom build methods for the constant operation. These methods populate // the `state` that MLIR uses to create operations, i.e. these are used when // using `builder.create<ConstantOp>(...)`. @@ -89,6 +92,10 @@ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseBinaryOp(parser, result); }]; + let printer = [{ return ::printBinaryOp(p, *this); }]; + // Allow building an AddOp from the two input operands. let builders = [ OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> @@ -110,6 +117,8 @@ let arguments = (ins F64Tensor:$input); let results = (outs F64Tensor:$output); + let assemblyFormat = "$input attr-dict `:` type($input) `to` type($output)"; + // Set the folder bit so that we can fold redundant cast operations. let hasFolder = 1; } @@ -124,7 +133,7 @@ arguments expected by the callee. For example: ```mlir - %4 = "toy.generic_call"(%1, %3) {callee = @my_func} + %4 = toy.generic_call @my_func(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> ``` @@ -139,6 +148,11 @@ // The generic call operation returns a single value of TensorType. let results = (outs F64Tensor); + // Specify the assembly format of the generic call operation. + let assemblyFormat = [{ + $callee `(` $inputs `)` attr-dict `:` functional-type($inputs, results) + }]; + // Add custom build methods for the generic call operation. let builders = [ OpBuilder<"Builder *builder, OperationState &state, " @@ -157,6 +171,10 @@ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseBinaryOp(parser, result); }]; + let printer = [{ return ::printBinaryOp(p, *this); }]; + // Allow building a MulOp from the two input operands.
let builders = [ OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> @@ -173,6 +191,8 @@ // The print operation takes an input tensor to print. // We also allow an F64MemRef to enable interop during partial lowering. let arguments = (ins AnyTypeOf<[F64Tensor, F64MemRef]>:$input); + + let assemblyFormat = "$input attr-dict `:` type($input)"; } def ReshapeOp : Toy_Op<"reshape", [NoSideEffect]> { @@ -182,15 +202,21 @@ the same number of elements but different shapes. For example: ```mlir - %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64> + %0 = toy.reshape (%arg1 : tensor<10xf64>) to tensor<5x2xf64> ``` }]; let arguments = (ins F64Tensor:$input); - let hasCanonicalizer = 1; // We expect that the reshape operation returns a statically shaped tensor. let results = (outs StaticShapeTensorOf<[F64]>); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. + let hasCanonicalizer = 1; } def ReturnOp : Toy_Op<"return", [Terminator, HasParent<"FuncOp">]> { @@ -213,6 +239,9 @@ // value must match the return type of the enclosing function. let arguments = (ins Variadic<F64Tensor>:$input); + // The return operation only emits the input in the format if it is present. + let assemblyFormat = "($input^ `:` type($input))? attr-dict "; + // Allow building a ReturnOp with no return operand. let builders = [OpBuilder< "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }] @@ -233,6 +262,12 @@ let arguments = (ins F64Tensor:$input); let results = (outs F64Tensor); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. let hasCanonicalizer = 1; // Allow building a TransposeOp from the input operand. diff --git a/mlir/examples/toy/Ch5/mlir/Dialect.cpp b/mlir/examples/toy/Ch5/mlir/Dialect.cpp --- a/mlir/examples/toy/Ch5/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch5/mlir/Dialect.cpp @@ -14,6 +14,7 @@ #include "toy/Dialect.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/OpImplementation.h" #include "mlir/IR/StandardTypes.h" #include "mlir/Transforms/InliningUtils.h" @@ -86,6 +87,54 @@ // Toy Operations //===----------------------------------------------------------------------===// +/// A generalized parser for binary operations. This parses the different forms +/// of 'printBinaryOp' below. +static mlir::ParseResult parseBinaryOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + SmallVector<mlir::OpAsmParser::OperandType, 2> operands; + llvm::SMLoc operandsLoc = parser.getCurrentLocation(); + Type type; + if (parser.parseOperandList(operands, /*requiredOperandCount=*/2) || + parser.parseOptionalAttrDict(result.attributes) || + parser.parseColonType(type)) + return mlir::failure(); + + // If the type is a function type, it contains the input and result types of + // this operation. + if (FunctionType funcType = type.dyn_cast<FunctionType>()) { + if (parser.resolveOperands(operands, funcType.getInputs(), operandsLoc, + result.operands)) + return mlir::failure(); + result.addTypes(funcType.getResults()); + return mlir::success(); + } + + // Otherwise, the parsed type is the type of both operands and results. + if (parser.resolveOperands(operands, type, result.operands)) + return mlir::failure(); + result.addTypes(type); + return mlir::success(); +} + +/// A generalized printer for binary operations. It prints in two different +/// forms depending on whether all of the types match. +static void printBinaryOp(mlir::OpAsmPrinter &printer, mlir::Operation *op) { + printer << op->getName() << " " << op->getOperands(); + printer.printOptionalAttrDict(op->getAttrs()); + printer << " : "; + + // If all of the types are the same, print the type directly. + Type resultType = *op->result_type_begin(); + if (llvm::all_of(op->getOperandTypes(), + [=](Type type) { return type == resultType; })) { + printer << resultType; + return; + } + + // Otherwise, print a functional type. + printer.printFunctionalType(op->getOperandTypes(), op->getResultTypes()); +} + //===----------------------------------------------------------------------===// // ConstantOp @@ -99,6 +148,32 @@ ConstantOp::build(builder, state, dataType, dataAttribute); } +/// The 'OpAsmParser' class provides a collection of methods for parsing +/// various punctuation, as well as attributes, operands, types, etc. Each of +/// these methods returns a `ParseResult`. This class is a wrapper around +/// `LogicalResult` that can be converted to a boolean `true` value on failure, +/// or `false` on success. This allows for easily chaining together a set of +/// parser rules. These rules are used to populate an `mlir::OperationState` +/// similarly to the `build` methods described above. +static mlir::ParseResult parseConstantOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + mlir::DenseElementsAttr value; + if (parser.parseOptionalAttrDict(result.attributes) || + parser.parseAttribute(value, "value", result.attributes)) + return failure(); + + result.addTypes(value.getType()); + return success(); +} + +/// The 'OpAsmPrinter' class is a stream that allows for formatting +/// strings, attributes, operands, types, etc. +static void print(mlir::OpAsmPrinter &printer, ConstantOp op) { + printer << "toy.constant "; + printer.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"value"}); + printer << op.value(); +} + /// Verifier for the constant operation. This corresponds to the `::verify(...)` /// in the op definition. static mlir::LogicalResult verify(ConstantOp op) { diff --git a/mlir/examples/toy/Ch6/include/toy/Ops.td b/mlir/examples/toy/Ch6/include/toy/Ops.td --- a/mlir/examples/toy/Ch6/include/toy/Ops.td +++ b/mlir/examples/toy/Ch6/include/toy/Ops.td @@ -48,9 +48,8 @@ to the operation as an attribute. For example: ```mlir - %0 = "toy.constant"() - { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } - : () -> tensor<2x3xf64> + %0 = toy.constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> + : tensor<2x3xf64> ``` }]; @@ -60,6 +59,10 @@ // The constant operation returns a single value of TensorType. let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseConstantOp(parser, result); }]; + let printer = [{ return ::print(p, *this); }]; + // Add custom build methods for the constant operation. These methods populate // the `state` that MLIR uses to create operations, i.e. these are used when // using `builder.create<ConstantOp>(...)`. @@ -89,6 +92,10 @@ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseBinaryOp(parser, result); }]; + let printer = [{ return ::printBinaryOp(p, *this); }]; + // Allow building an AddOp from the two input operands. let builders = [ OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> @@ -110,6 +117,8 @@ let arguments = (ins F64Tensor:$input); let results = (outs F64Tensor:$output); + let assemblyFormat = "$input attr-dict `:` type($input) `to` type($output)"; + // Set the folder bit so that we can fold redundant cast operations. let hasFolder = 1; } @@ -124,7 +133,7 @@ arguments expected by the callee. For example: ```mlir - %4 = "toy.generic_call"(%1, %3) {callee = @my_func} + %4 = toy.generic_call @my_func(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> ``` @@ -139,6 +148,11 @@ // The generic call operation returns a single value of TensorType. let results = (outs F64Tensor); + // Specify the assembly format of the generic call operation. + let assemblyFormat = [{ + $callee `(` $inputs `)` attr-dict `:` functional-type($inputs, results) + }]; + // Add custom build methods for the generic call operation. let builders = [ OpBuilder<"Builder *builder, OperationState &state, " @@ -157,6 +171,10 @@ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); let results = (outs F64Tensor); + // Specify a parser and printer method. + let parser = [{ return ::parseBinaryOp(parser, result); }]; + let printer = [{ return ::printBinaryOp(p, *this); }]; + // Allow building a MulOp from the two input operands. let builders = [ OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> @@ -173,6 +191,8 @@ // The print operation takes an input tensor to print. // We also allow an F64MemRef to enable interop during partial lowering. let arguments = (ins AnyTypeOf<[F64Tensor, F64MemRef]>:$input); + + let assemblyFormat = "$input attr-dict `:` type($input)"; } def ReshapeOp : Toy_Op<"reshape", [NoSideEffect]> { @@ -182,11 +202,17 @@ the same number of elements but different shapes. For example: ```mlir - %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64> + %0 = toy.reshape (%arg1 : tensor<10xf64>) to tensor<5x2xf64> ``` }]; let arguments = (ins F64Tensor:$input); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. let hasCanonicalizer = 1; // We expect that the reshape operation returns a statically shaped tensor. @@ -213,6 +239,9 @@ // value must match the return type of the enclosing function. let arguments = (ins Variadic<F64Tensor>:$input); + // The return operation only emits the input in the format if it is present. + let assemblyFormat = "($input^ `:` type($input))? attr-dict "; + // Allow building a ReturnOp with no return operand. let builders = [OpBuilder< "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }] @@ -233,6 +262,12 @@ let arguments = (ins F64Tensor:$input); let results = (outs F64Tensor); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. let hasCanonicalizer = 1; // Allow building a TransposeOp from the input operand.
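Since `hasCanonicalizer = 1` appears on several of these operations: the bit declares a `getCanonicalizationPatterns` hook on the generated op class, which the dialect implements out of line. A minimal sketch of such an implementation, modeled on the Toy tutorial; the `SimplifyRedundantTranspose` pattern is assumed to be defined elsewhere (e.g. in ToyCombine.cpp) and is not part of this patch:

```c++
// Sketch: the hook declared by `hasCanonicalizer = 1`, registering a single
// rewrite pattern for toy.transpose with the canonicalization driver.
void TransposeOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
                                              MLIRContext *context) {
  results.insert<SimplifyRedundantTranspose>(context);
}
```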
diff --git a/mlir/examples/toy/Ch6/mlir/Dialect.cpp b/mlir/examples/toy/Ch6/mlir/Dialect.cpp
--- a/mlir/examples/toy/Ch6/mlir/Dialect.cpp
+++ b/mlir/examples/toy/Ch6/mlir/Dialect.cpp
@@ -14,6 +14,7 @@
 #include "toy/Dialect.h"

 #include "mlir/IR/Builders.h"
+#include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/StandardTypes.h"
 #include "mlir/Transforms/InliningUtils.h"

@@ -86,6 +87,54 @@
 // Toy Operations
 //===----------------------------------------------------------------------===//

+/// A generalized parser for binary operations. This parses the different forms
+/// of 'printBinaryOp' below.
+static mlir::ParseResult parseBinaryOp(mlir::OpAsmParser &parser,
+                                       mlir::OperationState &result) {
+  SmallVector<mlir::OpAsmParser::OperandType, 2> operands;
+  llvm::SMLoc operandsLoc = parser.getCurrentLocation();
+  Type type;
+  if (parser.parseOperandList(operands, /*requiredOperandCount=*/2) ||
+      parser.parseOptionalAttrDict(result.attributes) ||
+      parser.parseColonType(type))
+    return mlir::failure();
+
+  // If the type is a function type, it contains the input and result types of
+  // this operation.
+  if (FunctionType funcType = type.dyn_cast<FunctionType>()) {
+    if (parser.resolveOperands(operands, funcType.getInputs(), operandsLoc,
+                               result.operands))
+      return mlir::failure();
+    result.addTypes(funcType.getResults());
+    return mlir::success();
+  }
+
+  // Otherwise, the parsed type is the type of both operands and results.
+  if (parser.resolveOperands(operands, type, result.operands))
+    return mlir::failure();
+  result.addTypes(type);
+  return mlir::success();
+}
+
+/// A generalized printer for binary operations. It prints in two different
+/// forms depending on if all of the types match.
+static void printBinaryOp(mlir::OpAsmPrinter &printer, mlir::Operation *op) {
+  printer << op->getName() << " " << op->getOperands();
+  printer.printOptionalAttrDict(op->getAttrs());
+  printer << " : ";
+
+  // If all of the types are the same, print the type directly.
+  Type resultType = *op->result_type_begin();
+  if (llvm::all_of(op->getOperandTypes(),
+                   [=](Type type) { return type == resultType; })) {
+    printer << resultType;
+    return;
+  }
+
+  // Otherwise, print a functional type.
+  printer.printFunctionalType(op->getOperandTypes(), op->getResultTypes());
+}
+
 //===----------------------------------------------------------------------===//
 // ConstantOp

@@ -99,6 +148,32 @@
   ConstantOp::build(builder, state, dataType, dataAttribute);
 }

+/// The 'OpAsmParser' class provides a collection of methods for parsing
+/// various punctuation, as well as attributes, operands, types, etc. Each of
+/// these methods returns a `ParseResult`. This class is a wrapper around
+/// `LogicalResult` that can be converted to a boolean `true` value on failure,
+/// or `false` on success. This allows for easily chaining together a set of
+/// parser rules. These rules are used to populate an `mlir::OperationState`
+/// similarly to the `build` methods described above.
+static mlir::ParseResult parseConstantOp(mlir::OpAsmParser &parser,
+                                         mlir::OperationState &result) {
+  mlir::DenseElementsAttr value;
+  if (parser.parseOptionalAttrDict(result.attributes) ||
+      parser.parseAttribute(value, "value", result.attributes))
+    return failure();
+
+  result.addTypes(value.getType());
+  return success();
+}
+
+/// The 'OpAsmPrinter' class is a stream that allows for formatting strings,
+/// attributes, operands, types, etc.
+static void print(mlir::OpAsmPrinter &printer, ConstantOp op) {
+  printer << "toy.constant ";
+  printer.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"value"});
+  printer << op.value();
+}
+
 /// Verifier for the constant operation. This corresponds to the `::verify(...)`
 /// in the op definition.
 static mlir::LogicalResult verify(ConstantOp op) {
diff --git a/mlir/examples/toy/Ch7/include/toy/Ops.td b/mlir/examples/toy/Ch7/include/toy/Ops.td
--- a/mlir/examples/toy/Ch7/include/toy/Ops.td
+++ b/mlir/examples/toy/Ch7/include/toy/Ops.td
@@ -57,9 +57,8 @@
     to the operation as an attribute. For example:

    ```mlir
-      %0 = "toy.constant"()
-         { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> }
-        : () -> tensor<2x3xf64>
+      %0 = toy.constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]>
+                        : tensor<2x3xf64>
    ```
  }];

@@ -69,6 +68,10 @@
   // The constant operation returns a single value of TensorType.
   let results = (outs F64Tensor);

+  // Specify a parser and printer method.
+  let parser = [{ return ::parseConstantOp(parser, result); }];
+  let printer = [{ return ::print(p, *this); }];
+
   // Add custom build methods for the constant operation. These methods populate
   // the `state` that MLIR uses to create operations, i.e. these are used when
   // using `builder.create<ConstantOp>(...)`.
@@ -101,6 +104,10 @@
   let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs);
   let results = (outs F64Tensor);

+  // Specify a parser and printer method.
+  let parser = [{ return ::parseBinaryOp(parser, result); }];
+  let printer = [{ return ::printBinaryOp(p, *this); }];
+
   // Allow building an AddOp from the two input operands.
   let builders = [
     OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs">
@@ -122,6 +129,8 @@
   let arguments = (ins F64Tensor:$input);
   let results = (outs F64Tensor:$output);

+  let assemblyFormat = "$input attr-dict `:` type($input) `to` type($output)";
+
   // Set the folder bit so that we can fold redundant cast operations.
   let hasFolder = 1;
 }
@@ -136,7 +145,7 @@
     arguments expected by the callee. For example:

    ```mlir
-      %4 = "toy.generic_call"(%1, %3) {callee = @my_func}
+      %4 = toy.generic_call @my_func(%1, %3)
           : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64>
    ```

@@ -152,6 +161,11 @@
   // StructType.
   let results = (outs Toy_Type);

+  // Specify the assembly format of the generic call operation.
+  let assemblyFormat = [{
+    $callee `(` $inputs `)` attr-dict `:` functional-type($inputs, results)
+  }];
+
   // Add custom build methods for the generic call operation.
   let builders = [
     OpBuilder<"Builder *builder, OperationState &state, "
@@ -170,6 +184,10 @@
   let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs);
   let results = (outs F64Tensor);

+  // Specify a parser and printer method.
+  let parser = [{ return ::parseBinaryOp(parser, result); }];
+  let printer = [{ return ::printBinaryOp(p, *this); }];
+
   // Allow building a MulOp from the two input operands.
   let builders = [
     OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs">
@@ -186,6 +204,8 @@
   // The print operation takes an input tensor to print.
   // We also allow a F64MemRef to enable interop during partial lowering.
   let arguments = (ins AnyTypeOf<[F64Tensor, F64MemRef]>:$input);
+
+  let assemblyFormat = "$input attr-dict `:` type($input)";
 }

 def ReshapeOp : Toy_Op<"reshape", [NoSideEffect]> {
@@ -195,11 +215,17 @@
     the same number of elements but different shapes.
     For example:

    ```mlir
-      %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64>
+      %0 = toy.reshape (%arg1 : tensor<10xf64>) to tensor<5x2xf64>
    ```
  }];

   let arguments = (ins F64Tensor:$input);
+
+  let assemblyFormat = [{
+    `(` $input `:` type($input) `)` attr-dict `to` type(results)
+  }];
+
+  // Enable registering canonicalization patterns with this operation.
   let hasCanonicalizer = 1;

   // We expect that the reshape operation returns a statically shaped tensor.
@@ -226,6 +252,9 @@
   // value must match the return type of the enclosing function.
   let arguments = (ins Variadic<F64Tensor>:$input);

+  // The return operation only emits the input in the format if it is present.
+  let assemblyFormat = "($input^ `:` type($input))? attr-dict";
+
   // Allow building a ReturnOp with no return operand.
   let builders = [OpBuilder<
     "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }]
@@ -247,7 +276,11 @@
  }];

   let arguments = (ins Toy_StructType:$input, I64Attr:$index);
-  let results = (outs Toy_Type);
+  let results = (outs Toy_Type:$output);
+
+  let assemblyFormat = [{
+    $input `[` $index `]` attr-dict `:` type($input) `->` type($output)
+  }];

   // Allow building a StructAccessOp with just a struct value and an index.
   let builders = [
@@ -268,16 +301,19 @@
     as an array of other constant values. For example:

    ```mlir
-      %0 = "toy.struct_constant"() {
-        value = [dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64>]
-      } : () -> !toy.struct<tensor<*xf64>>
+      %0 = toy.struct_constant [
+        dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64>
+      ] : !toy.struct<tensor<*xf64>>
    ```
  }];

-  let hasFolder = 1;
   let arguments = (ins ArrayAttr:$value);
-  let results = (outs Toy_StructType);
+  let results = (outs Toy_StructType:$output);
+
+  let assemblyFormat = "$value attr-dict `:` type($output)";
+
+  let verifier = [{ return ::verify(*this); }];
+  let hasFolder = 1;
 }

 def TransposeOp : Toy_Op<"transpose",
@@ -286,6 +322,12 @@
   let arguments = (ins F64Tensor:$input);
   let results = (outs F64Tensor);
+
+  let assemblyFormat = [{
+    `(` $input `:` type($input) `)` attr-dict `to` type(results)
+  }];
+
+  // Enable registering canonicalization patterns with this operation.
   let hasCanonicalizer = 1;

   // Allow building a TransposeOp from the input operand.
diff --git a/mlir/examples/toy/Ch7/mlir/Dialect.cpp b/mlir/examples/toy/Ch7/mlir/Dialect.cpp
--- a/mlir/examples/toy/Ch7/mlir/Dialect.cpp
+++ b/mlir/examples/toy/Ch7/mlir/Dialect.cpp
@@ -15,6 +15,7 @@

 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/StandardTypes.h"
 #include "mlir/Transforms/InliningUtils.h"

@@ -99,6 +100,54 @@
 // Toy Operations
 //===----------------------------------------------------------------------===//

+/// A generalized parser for binary operations. This parses the different forms
+/// of 'printBinaryOp' below.
+static mlir::ParseResult parseBinaryOp(mlir::OpAsmParser &parser,
+                                       mlir::OperationState &result) {
+  SmallVector<mlir::OpAsmParser::OperandType, 2> operands;
+  llvm::SMLoc operandsLoc = parser.getCurrentLocation();
+  Type type;
+  if (parser.parseOperandList(operands, /*requiredOperandCount=*/2) ||
+      parser.parseOptionalAttrDict(result.attributes) ||
+      parser.parseColonType(type))
+    return mlir::failure();
+
+  // If the type is a function type, it contains the input and result types of
+  // this operation.
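+  // (This is the `(operand-types) -> result-types` form that printBinaryOp
+  // emits when the operand and result types differ.)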
+  if (FunctionType funcType = type.dyn_cast<FunctionType>()) {
+    if (parser.resolveOperands(operands, funcType.getInputs(), operandsLoc,
+                               result.operands))
+      return mlir::failure();
+    result.addTypes(funcType.getResults());
+    return mlir::success();
+  }
+
+  // Otherwise, the parsed type is the type of both operands and results.
+  if (parser.resolveOperands(operands, type, result.operands))
+    return mlir::failure();
+  result.addTypes(type);
+  return mlir::success();
+}
+
+/// A generalized printer for binary operations. It prints in two different
+/// forms depending on if all of the types match.
+static void printBinaryOp(mlir::OpAsmPrinter &printer, mlir::Operation *op) {
+  printer << op->getName() << " " << op->getOperands();
+  printer.printOptionalAttrDict(op->getAttrs());
+  printer << " : ";
+
+  // If all of the types are the same, print the type directly.
+  Type resultType = *op->result_type_begin();
+  if (llvm::all_of(op->getOperandTypes(),
+                   [=](Type type) { return type == resultType; })) {
+    printer << resultType;
+    return;
+  }
+
+  // Otherwise, print a functional type.
+  printer.printFunctionalType(op->getOperandTypes(), op->getResultTypes());
+}
+
 //===----------------------------------------------------------------------===//
 // ConstantOp

@@ -112,6 +161,32 @@
   ConstantOp::build(builder, state, dataType, dataAttribute);
 }

+/// The 'OpAsmParser' class provides a collection of methods for parsing
+/// various punctuation, as well as attributes, operands, types, etc. Each of
+/// these methods returns a `ParseResult`. This class is a wrapper around
+/// `LogicalResult` that can be converted to a boolean `true` value on failure,
+/// or `false` on success. This allows for easily chaining together a set of
+/// parser rules. These rules are used to populate an `mlir::OperationState`
+/// similarly to the `build` methods described above.
+static mlir::ParseResult parseConstantOp(mlir::OpAsmParser &parser,
+                                         mlir::OperationState &result) {
+  mlir::DenseElementsAttr value;
+  if (parser.parseOptionalAttrDict(result.attributes) ||
+      parser.parseAttribute(value, "value", result.attributes))
+    return failure();
+
+  result.addTypes(value.getType());
+  return success();
+}
+
+/// The 'OpAsmPrinter' class is a stream that allows for formatting strings,
+/// attributes, operands, types, etc.
+static void print(mlir::OpAsmPrinter &printer, ConstantOp op) {
+  printer << "toy.constant ";
+  printer.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"value"});
+  printer << op.value();
+}
+
 /// Verify that the given attribute value is valid for the given type.
 static mlir::LogicalResult verifyConstantForType(mlir::Type type,
                                                  mlir::Attribute opaqueValue,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
@@ -31,6 +31,25 @@
        CPred<"$_self.cast<::mlir::LLVM::LLVMType>().isIntegerTy()">]>,
   "LLVM dialect integer">;

+def LLVMIntBase : TypeConstraint<
+  And<[LLVM_Type.predicate,
+       CPred<"$_self.cast<::mlir::LLVM::LLVMType>().isIntegerTy()">]>,
+  "LLVM dialect integer">;
+
+// Integer type of a specific width.
+class LLVMI<int width>
+    : Type<And<[LLVM_Type.predicate,
+                CPred<"$_self.cast<::mlir::LLVM::LLVMType>()"
+                      ".isIntegerTy(" # width # ")">]>,
+           "LLVM dialect " # width # "-bit integer">,
+      BuildableType<
+          "::mlir::LLVM::LLVMType::getIntNTy("
+          "$_builder.getContext()->getRegisteredDialect<::mlir::LLVM::LLVMDialect>(),"
+          # width # ")">;
+
+def LLVMI1 : LLVMI<1>;
+
 // Base class for LLVM operations.
Defines the interface to the llvm::IRBuilder // used to translate to LLVM IR proper. class LLVM_OpBase traits = []> : diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -72,8 +72,7 @@ // Base class for LLVM terminator operations. All terminator operations have // zero results and an optional list of successors. class LLVM_TerminatorOp traits = []> : - LLVM_Op, - Arguments<(ins Variadic:$args)>, Results<(outs)> { + LLVM_Op { let builders = [ OpBuilder< "Builder *, OperationState &result, " @@ -320,15 +319,10 @@ Arguments<(ins OptionalAttr:$callee, Variadic)>, Results<(outs Variadic)> { + let successors = (successor AnySuccessor:$normalDest, + AnySuccessor:$unwindDest); + let builders = [OpBuilder< - "Builder *b, OperationState &result, ArrayRef tys, " - "FlatSymbolRefAttr callee, ValueRange ops, Block* normal, " - "ValueRange normalOps, Block* unwind, ValueRange unwindOps", - [{ - result.addAttribute("callee", callee); - build(b, result, tys, ops, normal, normalOps, unwind, unwindOps); - }]>, - OpBuilder< "Builder *b, OperationState &result, ArrayRef tys, " "ValueRange ops, Block* normal, " "ValueRange normalOps, Block* unwind, ValueRange unwindOps", @@ -460,19 +454,16 @@ // Terminators. def LLVM_BrOp : LLVM_TerminatorOp<"br", []> { - let parser = [{ return parseBrOp(parser, result); }]; - let printer = [{ printBrOp(p, *this); }]; + let successors = (successor AnySuccessor:$dest); + let assemblyFormat = "$dest attr-dict"; } def LLVM_CondBrOp : LLVM_TerminatorOp<"cond_br", []> { - let verifier = [{ - if (getNumSuccessors() != 2) - return emitOpError("expected exactly two successors"); - return success(); - }]; - let parser = [{ return parseCondBrOp(parser, result); }]; - let printer = [{ printCondBrOp(p, *this); }]; + let arguments = (ins LLVMI1:$condition); + let successors = (successor AnySuccessor:$trueDest, AnySuccessor:$falseDest); + let assemblyFormat = "$condition `,` successors attr-dict"; } -def LLVM_ReturnOp : LLVM_TerminatorOp<"return", []> { +def LLVM_ReturnOp : LLVM_TerminatorOp<"return", []>, + Arguments<(ins Variadic:$args)> { string llvmBuilder = [{ if ($_numOperands != 0) builder.CreateRet($args[0]); diff --git a/mlir/include/mlir/Dialect/LoopOps/Passes.h b/mlir/include/mlir/Dialect/LoopOps/Passes.h --- a/mlir/include/mlir/Dialect/LoopOps/Passes.h +++ b/mlir/include/mlir/Dialect/LoopOps/Passes.h @@ -13,6 +13,7 @@ #ifndef MLIR_DIALECT_LOOPOPS_PASSES_H_ #define MLIR_DIALECT_LOOPOPS_PASSES_H_ +#include "llvm/ADT/ArrayRef.h" #include namespace mlir { @@ -22,6 +23,10 @@ /// Creates a loop fusion pass which fuses parallel loops. std::unique_ptr createParallelLoopFusionPass(); +/// Creates a pass which tiles innermost parallel loops. 
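+/// `tileSize` supplies one tile size per tiled loop dimension; dimensions
+/// without an entry are tiled by 1, i.e. effectively left untiled.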
+std::unique_ptr +createParallelLoopTilingPass(llvm::ArrayRef tileSize = {}); + } // namespace mlir #endif // MLIR_DIALECT_LOOPOPS_PASSES_H_ diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVControlFlowOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVControlFlowOps.td --- a/mlir/include/mlir/Dialect/SPIRV/SPIRVControlFlowOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVControlFlowOps.td @@ -41,12 +41,14 @@ ``` }]; - let arguments = (ins - Variadic:$block_arguments - ); + let arguments = (ins); let results = (outs); + let successors = (successor AnySuccessor:$target); + + let verifier = [{ return success(); }]; + let builders = [ OpBuilder< "Builder *, OperationState &state, " @@ -60,15 +62,15 @@ let extraClassDeclaration = [{ /// Returns the branch target block. - Block *getTarget() { return getOperation()->getSuccessor(0); } + Block *getTarget() { return target(); } /// Returns the block arguments. - operand_range getBlockArguments() { - return getOperation()->getSuccessorOperands(0); - } + operand_range getBlockArguments() { return targetOperands(); } }]; let autogenSerialization = 0; + + let assemblyFormat = "successors attr-dict"; } // ----- @@ -115,12 +117,14 @@ let arguments = (ins SPV_Bool:$condition, - Variadic:$branch_arguments, OptionalAttr:$branch_weights ); let results = (outs); + let successors = (successor AnySuccessor:$trueTarget, + AnySuccessor:$falseTarget); + let builders = [ OpBuilder< "Builder *builder, OperationState &state, Value condition, " diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td --- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td +++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td @@ -232,12 +232,10 @@ ^bb3(%3: tensor<*xf32>): }]; - let arguments = (ins Variadic:$operands); + let successors = (successor AnySuccessor:$dest); - let builders = [OpBuilder< - "Builder *, OperationState &result, Block *dest," - "ValueRange operands = {}", [{ - result.addSuccessor(dest, operands); + let builders = [OpBuilder<"Builder *, OperationState &result, Block *dest", [{ + result.addSuccessor(dest, llvm::None); }]>]; // BranchOp is fully verified by traits. @@ -252,6 +250,7 @@ }]; let hasCanonicalizer = 1; + let assemblyFormat = "$dest attr-dict"; } def CallOp : Std_Op<"call", [CallOpInterface]> { @@ -357,6 +356,8 @@ let verifier = ?; let hasCanonicalizer = 1; + + let assemblyFormat = "$callee `(` $operands `)` attr-dict `:` type($callee)"; } def CeilFOp : FloatUnaryOp<"ceilf"> { @@ -490,6 +491,8 @@ let verifier = [{ return success(); }]; let hasFolder = 1; + + let assemblyFormat = "$predicate `,` $lhs `,` $rhs attr-dict `:` type($lhs)"; } def CondBranchOp : Std_Op<"cond_br", [Terminator]> { @@ -509,16 +512,8 @@ ... }]; - let arguments = (ins I1:$condition, Variadic:$branchOperands); - - let builders = [OpBuilder< - "Builder *, OperationState &result, Value condition," - "Block *trueDest, ValueRange trueOperands," - "Block *falseDest, ValueRange falseOperands", [{ - result.addOperands(condition); - result.addSuccessor(trueDest, trueOperands); - result.addSuccessor(falseDest, falseOperands); - }]>]; + let arguments = (ins I1:$condition); + let successors = (successor AnySuccessor:$trueDest, AnySuccessor:$falseDest); // CondBranchOp is fully verified by traits. 
let verifier = ?; @@ -608,6 +603,7 @@ }]; let hasCanonicalizer = 1; + let assemblyFormat = "$condition `,` successors attr-dict"; } def ConstantOp : Std_Op<"constant", @@ -761,6 +757,10 @@ }]; let hasFolder = 1; + + let assemblyFormat = [{ + $aggregate `[` $indices `]` attr-dict `:` type($aggregate) + }]; } def IndexCastOp : CastOp<"index_cast">, Arguments<(ins AnyType:$in)> { @@ -853,6 +853,8 @@ }]; let hasFolder = 1; + + let assemblyFormat = "$memref `[` $indices `]` attr-dict `:` type($memref)"; } def LogOp : FloatUnaryOp<"log"> { @@ -1049,6 +1051,8 @@ let builders = [OpBuilder< "Builder *b, OperationState &result", [{ build(b, result, llvm::None); }] >]; + + let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?"; } def SelectOp : Std_Op<"select", [NoSideEffect, SameOperandsAndResultShape, @@ -1090,6 +1094,10 @@ }]; let hasFolder = 1; + + let assemblyFormat = [{ + $condition `,` $true_value `,` $false_value attr-dict `:` type($result) + }]; } def SignExtendIOp : Std_Op<"sexti", @@ -1222,6 +1230,8 @@ [{ build(builder, result, aggregateType, element); }]>]; let hasFolder = 1; + + let assemblyFormat = "$input attr-dict `:` type($aggregate)"; } def StoreOp : Std_Op<"store", @@ -1264,6 +1274,10 @@ }]; let hasFolder = 1; + + let assemblyFormat = [{ + $value `,` $memref `[` $indices `]` attr-dict `:` type($memref) + }]; } def SubFOp : FloatArithmeticOp<"subf"> { @@ -1517,11 +1531,12 @@ result.addTypes(resultType); }]>]; - let extraClassDeclaration = [{ /// The result of a tensor_load is always a tensor. TensorType getType() { return getResult().getType().cast(); } }]; + + let assemblyFormat = "$memref attr-dict `:` type($memref)"; } def TensorStoreOp : Std_Op<"tensor_store", @@ -1545,6 +1560,8 @@ let arguments = (ins AnyTensor:$tensor, AnyMemRef:$memref); // TensorStoreOp is fully verified by traits. let verifier = ?; + + let assemblyFormat = "$tensor `,` $memref attr-dict `:` type($memref)"; } def TruncateIOp : Std_Op<"trunci", [NoSideEffect, SameOperandsAndResultShape]> { diff --git a/mlir/include/mlir/Dialect/VectorOps/VectorOps.td b/mlir/include/mlir/Dialect/VectorOps/VectorOps.td --- a/mlir/include/mlir/Dialect/VectorOps/VectorOps.td +++ b/mlir/include/mlir/Dialect/VectorOps/VectorOps.td @@ -363,6 +363,10 @@ return vector().getType().cast(); } }]; + + let assemblyFormat = [{ + $vector `[` $position `:` type($position) `]` attr-dict `:` type($vector) + }]; } def Vector_ExtractOp : @@ -512,6 +516,11 @@ return dest().getType().cast(); } }]; + + let assemblyFormat = [{ + $source `,` $dest `[` $position `:` type($position) `]` attr-dict `:` + type($result) + }]; } def Vector_InsertOp : diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td --- a/mlir/include/mlir/IR/OpBase.td +++ b/mlir/include/mlir/IR/OpBase.td @@ -185,6 +185,10 @@ class RegionConstraint : Constraint; +// Subclass for constraints on a successor. +class SuccessorConstraint : + Constraint; + // How to use these constraint categories: // // * Use TypeConstraint to specify @@ -1341,6 +1345,21 @@ CPred<"$_self.getBlocks().size() == " # numBlocks>, "region with " # numBlocks # " blocks">; +//===----------------------------------------------------------------------===// +// Successor definitions +//===----------------------------------------------------------------------===// + +class Successor : + SuccessorConstraint; + +// Any successor. +def AnySuccessor : Successor; + +// A variadic successor constraint. It expands to zero or more of the base +// successor. 
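+// Used inside a successor list, for example:
+//   let successors = (successor VariadicSuccessor<AnySuccessor>:$dests);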
+class VariadicSuccessor + : Successor; + //===----------------------------------------------------------------------===// // OpTrait definitions //===----------------------------------------------------------------------===// @@ -1537,6 +1556,9 @@ // Marker used to identify the region list for an op. def region; +// Marker used to identify the successor list for an op. +def successor; + // Class for defining a custom builder. // // TableGen generates several generic builders for each op by default (see @@ -1587,6 +1609,9 @@ // The list of regions of the op. Default to 0 regions. dag regions = (region); + // The list of successors of the op. Default to 0 successors. + dag successors = (successor); + // Attribute getters can be added to the op by adding an Attr member // with the name and type of the attribute. E.g., adding int attribute // with name "value" and type "i32": diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h --- a/mlir/include/mlir/IR/OpImplementation.h +++ b/mlir/include/mlir/IR/OpImplementation.h @@ -496,6 +496,12 @@ return failure(); return success(); } + template + ParseResult resolveOperands(Operands &&operands, Type type, llvm::SMLoc loc, + SmallVectorImpl &result) { + return resolveOperands(std::forward(operands), + ArrayRef(type), loc, result); + } template ParseResult resolveOperands(Operands &&operands, Types &&types, llvm::SMLoc loc, SmallVectorImpl &result) { @@ -572,6 +578,11 @@ virtual ParseResult parseSuccessorAndUseList(Block *&dest, SmallVectorImpl &operands) = 0; + /// Parse an optional operation successor and its operand list. + virtual OptionalParseResult + parseOptionalSuccessorAndUseList(Block *&dest, + SmallVectorImpl &operands) = 0; + //===--------------------------------------------------------------------===// // Type Parsing //===--------------------------------------------------------------------===// diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h --- a/mlir/include/mlir/IR/OperationSupport.h +++ b/mlir/include/mlir/IR/OperationSupport.h @@ -294,6 +294,11 @@ void addTypes(ArrayRef newTypes) { types.append(newTypes.begin(), newTypes.end()); } + template + std::enable_if_t>::value> + addTypes(RangeT &&newTypes) { + types.append(newTypes.begin(), newTypes.end()); + } /// Add an attribute with the specified name. void addAttribute(StringRef name, Attribute attr) { diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -109,6 +109,7 @@ // LoopOps createParallelLoopFusionPass(); + createParallelLoopTilingPass(); // QuantOps quant::createConvertSimulatedQuantPass(); diff --git a/mlir/include/mlir/Support/STLExtras.h b/mlir/include/mlir/Support/STLExtras.h --- a/mlir/include/mlir/Support/STLExtras.h +++ b/mlir/include/mlir/Support/STLExtras.h @@ -339,6 +339,26 @@ }); } +/// A range class that repeats a specific value for a set number of times. +template +class RepeatRange + : public detail::indexed_accessor_range_base, T, const T> { +public: + using detail::indexed_accessor_range_base< + RepeatRange, T, const T>::indexed_accessor_range_base; + + /// Given that we are repeating a specific value, we can simply return that + /// value when offsetting the base or dereferencing the iterator. 
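+  /// (For example, make_repeated_range(ty, 3) below yields a range whose
+  /// three elements all dereference to `ty`.)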
+ static T offset_base(const T &val, ptrdiff_t) { return val; } + static const T &dereference_iterator(const T &val, ptrdiff_t) { return val; } +}; + +/// Make a range that repeats the given value 'n' times. +template +RepeatRange make_repeated_range(const ValueTy &value, size_t n) { + return RepeatRange(value, n); +} + /// Returns true of the given range only contains a single element. template bool has_single_element(ContainerTy &&c) { auto it = std::begin(c), e = std::end(c); diff --git a/mlir/include/mlir/TableGen/Constraint.h b/mlir/include/mlir/TableGen/Constraint.h --- a/mlir/include/mlir/TableGen/Constraint.h +++ b/mlir/include/mlir/TableGen/Constraint.h @@ -48,7 +48,7 @@ StringRef getDescription() const; // Constraint kind - enum Kind { CK_Attr, CK_Region, CK_Type, CK_Uncategorized }; + enum Kind { CK_Attr, CK_Region, CK_Successor, CK_Type, CK_Uncategorized }; Kind getKind() const { return kind; } diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h --- a/mlir/include/mlir/TableGen/Operator.h +++ b/mlir/include/mlir/TableGen/Operator.h @@ -19,6 +19,7 @@ #include "mlir/TableGen/Dialect.h" #include "mlir/TableGen/OpTrait.h" #include "mlir/TableGen/Region.h" +#include "mlir/TableGen/Successor.h" #include "mlir/TableGen/Type.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SmallVector.h" @@ -138,6 +139,20 @@ // Returns the `index`-th region. const NamedRegion &getRegion(unsigned index) const; + // Successors. + using const_successor_iterator = const NamedSuccessor *; + const_successor_iterator successor_begin() const; + const_successor_iterator successor_end() const; + llvm::iterator_range getSuccessors() const; + + // Returns the number of successors. + unsigned getNumSuccessors() const; + // Returns the `index`-th successor. + const NamedSuccessor &getSuccessor(unsigned index) const; + + // Returns the number of variadic successors in this operation. + unsigned getNumVariadicSuccessors() const; + // Trait. using const_trait_iterator = const OpTrait *; const_trait_iterator trait_begin() const; @@ -193,6 +208,9 @@ // The results of the op. SmallVector results; + // The successors of this op. + SmallVector successors; + // The traits of the op. SmallVector traits; diff --git a/mlir/include/mlir/TableGen/Successor.h b/mlir/include/mlir/TableGen/Successor.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/TableGen/Successor.h @@ -0,0 +1,44 @@ +//===- Successor.h - TableGen successor definitions -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TABLEGEN_SUCCESSOR_H_ +#define MLIR_TABLEGEN_SUCCESSOR_H_ + +#include "mlir/Support/LLVM.h" +#include "mlir/TableGen/Constraint.h" + +namespace mlir { +namespace tblgen { + +// Wrapper class providing helper methods for accessing Successor defined in +// TableGen. +class Successor : public Constraint { +public: + using Constraint::Constraint; + + static bool classof(const Constraint *c) { + return c->getKind() == CK_Successor; + } + + // Returns true if this successor is variadic. + bool isVariadic() const; +}; + +// A struct bundling a successor's constraint and its name. +struct NamedSuccessor { + // Returns true if this successor is variadic. 
+ bool isVariadic() const { return constraint.isVariadic(); } + + StringRef name; + Successor constraint; +}; + +} // end namespace tblgen +} // end namespace mlir + +#endif // MLIR_TABLEGEN_SUCCESSOR_H_ diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -234,15 +234,14 @@ static LogicalResult verify(InvokeOp op) { if (op.getNumResults() > 1) return op.emitOpError("must have 0 or 1 result"); - if (op.getNumSuccessors() != 2) - return op.emitOpError("must have normal and unwind destinations"); - if (op.getSuccessor(1)->empty()) + Block *unwindDest = op.unwindDest(); + if (unwindDest->empty()) return op.emitError( "must have at least one operation in unwind destination"); // In unwind destination, first operation must be LandingpadOp - if (!isa(op.getSuccessor(1)->front())) + if (!isa(unwindDest->front())) return op.emitError("first operation in unwind destination should be a " "llvm.landingpad operation"); @@ -781,69 +780,6 @@ return success(); } -//===----------------------------------------------------------------------===// -// Printing/parsing for LLVM::BrOp. -//===----------------------------------------------------------------------===// - -static void printBrOp(OpAsmPrinter &p, BrOp &op) { - p << op.getOperationName() << ' '; - p.printSuccessorAndUseList(op.getOperation(), 0); - p.printOptionalAttrDict(op.getAttrs()); -} - -// ::= `llvm.br` bb-id (`[` ssa-use-and-type-list `]`)? -// attribute-dict? -static ParseResult parseBrOp(OpAsmParser &parser, OperationState &result) { - Block *dest; - SmallVector operands; - if (parser.parseSuccessorAndUseList(dest, operands) || - parser.parseOptionalAttrDict(result.attributes)) - return failure(); - - result.addSuccessor(dest, operands); - return success(); -} - -//===----------------------------------------------------------------------===// -// Printing/parsing for LLVM::CondBrOp. -//===----------------------------------------------------------------------===// - -static void printCondBrOp(OpAsmPrinter &p, CondBrOp &op) { - p << op.getOperationName() << ' ' << op.getOperand(0) << ", "; - p.printSuccessorAndUseList(op.getOperation(), 0); - p << ", "; - p.printSuccessorAndUseList(op.getOperation(), 1); - p.printOptionalAttrDict(op.getAttrs()); -} - -// ::= `llvm.cond_br` ssa-use `,` -// bb-id (`[` ssa-use-and-type-list `]`)? `,` -// bb-id (`[` ssa-use-and-type-list `]`)? attribute-dict? -static ParseResult parseCondBrOp(OpAsmParser &parser, OperationState &result) { - Block *trueDest; - Block *falseDest; - SmallVector trueOperands; - SmallVector falseOperands; - OpAsmParser::OperandType condition; - - Builder &builder = parser.getBuilder(); - auto *llvmDialect = - builder.getContext()->getRegisteredDialect(); - auto i1Type = LLVM::LLVMType::getInt1Ty(llvmDialect); - - if (parser.parseOperand(condition) || parser.parseComma() || - parser.parseSuccessorAndUseList(trueDest, trueOperands) || - parser.parseComma() || - parser.parseSuccessorAndUseList(falseDest, falseOperands) || - parser.parseOptionalAttrDict(result.attributes) || - parser.resolveOperand(condition, i1Type, result.operands)) - return failure(); - - result.addSuccessor(trueDest, trueOperands); - result.addSuccessor(falseDest, falseOperands); - return success(); -} - //===----------------------------------------------------------------------===// // Printing/parsing for LLVM::ReturnOp. 
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt
--- a/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_llvm_library(MLIRLoopOpsTransforms
   ParallelLoopFusion.cpp
+  ParallelLoopTiling.cpp

   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LoopOps
diff --git a/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp
@@ -0,0 +1,133 @@
+//===- ParallelLoopTiling.cpp - Tiles loop.parallel -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements loop tiling on parallel loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/Dialect/LoopOps/Passes.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace mlir;
+using loop::ParallelOp;
+
+/// Tile a parallel loop of the form
+///   loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+///                                              step (%arg4, %arg5)
+///
+/// into
+///   loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+///                                              step (%arg4*tileSize[0],
+///                                                    %arg5*tileSize[1])
+///     loop.parallel (%j0, %j1) = (0, 0) to (min(tileSize[0], %arg2-%j0)
+///                                           min(tileSize[1], %arg3-%j1))
+///                                        step (%arg4, %arg5)
+///
+/// The old loop is replaced with the new one.
+static void tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes) {
+  OpBuilder b(op);
+  auto zero = b.create<ConstantIndexOp>(op.getLoc(), 0);
+  SmallVector<Value, 2> tileSizeConstants;
+  tileSizeConstants.reserve(op.upperBound().size());
+  for (size_t i = 0, end = op.upperBound().size(); i != end; ++i) {
+    if (i < tileSizes.size())
+      tileSizeConstants.push_back(
+          b.create<ConstantIndexOp>(op.getLoc(), tileSizes[i]));
+    else
+      // Just pick 1 for the remaining dimensions.
+      tileSizeConstants.push_back(b.create<ConstantIndexOp>(op.getLoc(), 1));
+  }
+
+  // Create the outer loop with adjusted steps.
+  SmallVector<Value, 2> newSteps;
+  newSteps.reserve(op.step().size());
+  for (auto step : llvm::zip(op.step(), tileSizeConstants)) {
+    newSteps.push_back(
+        b.create<MulIOp>(op.getLoc(), std::get<0>(step), std::get<1>(step)));
+  }
+  auto outerLoop = b.create<ParallelOp>(op.getLoc(), op.lowerBound(),
+                                        op.upperBound(), newSteps);
+  b.setInsertionPointToStart(outerLoop.getBody());
+
+  // Compute min(size, dim - offset) to avoid out-of-bounds accesses.
+  // FIXME: Instead of using min, we want to replicate the tail. This would give
+  // the inner loop constant bounds for easy vectorization.
+  auto minMap = AffineMap::get(
+      /*dimCount=*/3, /*symbolCount=*/0,
+      {getAffineDimExpr(/*position=*/0, b.getContext()),
+       getAffineDimExpr(/*position=*/1, b.getContext()) -
+           getAffineDimExpr(/*position=*/2, b.getContext())});
+
+  // Create the inner loop with adjusted bounds.
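+  // For each dimension this computes min(tileSize, upperBound - outerIV), so
+  // the trailing partial tile is clamped instead of running out of bounds.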
+  SmallVector<Value, 2> newBounds;
+  newBounds.reserve(op.upperBound().size());
+  for (auto bounds : llvm::zip(tileSizeConstants, outerLoop.upperBound(),
+                               outerLoop.getInductionVars())) {
+    newBounds.push_back(b.create<AffineMinOp>(
+        op.getLoc(), b.getIndexType(), minMap,
+        ValueRange{std::get<0>(bounds), std::get<1>(bounds),
+                   std::get<2>(bounds)}));
+  }
+  auto innerLoop = b.create<ParallelOp>(
+      op.getLoc(), SmallVector<Value, 2>(newBounds.size(), zero), newBounds,
+      op.step());
+
+  // Steal the body of the old parallel loop and erase it.
+  innerLoop.region().takeBody(op.region());
+  op.erase();
+}
+
+/// Get a list of the innermost parallel loops. Assumes that ParallelOps are
+/// only directly nested.
+static bool getInnermostNestedLoops(Block *block,
+                                    SmallVectorImpl<ParallelOp> &loops) {
+  bool hasInnerLoop = false;
+  for (auto parallelOp : block->getOps<ParallelOp>()) {
+    hasInnerLoop = true;
+    if (!getInnermostNestedLoops(parallelOp.getBody(), loops))
+      loops.push_back(parallelOp);
+  }
+  return hasInnerLoop;
+}
+
+namespace {
+struct ParallelLoopTiling : public FunctionPass<ParallelLoopTiling> {
+  ParallelLoopTiling() = default;
+  ParallelLoopTiling(const ParallelLoopTiling &) {} // tileSizes is non-copyable.
+  explicit ParallelLoopTiling(ArrayRef<int64_t> tileSizes) {
+    this->tileSizes = tileSizes;
+  }
+
+  void runOnFunction() override {
+    SmallVector<ParallelOp, 2> mostNestedParallelOps;
+    for (Block &block : getFunction()) {
+      getInnermostNestedLoops(&block, mostNestedParallelOps);
+    }
+    for (ParallelOp pLoop : mostNestedParallelOps) {
+      tileParallelLoop(pLoop, tileSizes);
+    }
+  }
+
+  ListOption<int64_t> tileSizes{
+      *this, "parallel-loop-tile-sizes",
+      llvm::cl::desc("factors to tile parallel loops by"), llvm::cl::ZeroOrMore,
+      llvm::cl::MiscFlags::CommaSeparated};
+};
+} // namespace
+
+std::unique_ptr<Pass>
+mlir::createParallelLoopTilingPass(ArrayRef<int64_t> tileSizes) {
+  return std::make_unique<ParallelLoopTiling>(tileSizes);
+}
+
+static PassRegistration<ParallelLoopTiling>
+    pass("parallel-loop-tiling", "Tile parallel loops.");
diff --git a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp
--- a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp
+++ b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp
@@ -1018,32 +1018,6 @@
   results.insert(context);
 }

-//===----------------------------------------------------------------------===//
-// spv.BranchOp
-//===----------------------------------------------------------------------===//
-
-static ParseResult parseBranchOp(OpAsmParser &parser, OperationState &state) {
-  Block *dest;
-  SmallVector<Value, 4> destOperands;
-  if (parser.parseSuccessorAndUseList(dest, destOperands))
-    return failure();
-  state.addSuccessor(dest, destOperands);
-  return success();
-}
-
-static void print(spirv::BranchOp branchOp, OpAsmPrinter &printer) {
-  printer << spirv::BranchOp::getOperationName() << ' ';
-  printer.printSuccessorAndUseList(branchOp.getOperation(), /*index=*/0);
-}
-
-static LogicalResult verify(spirv::BranchOp branchOp) {
-  auto *op = branchOp.getOperation();
-  if (op->getNumSuccessors() != 1)
-    branchOp.emitOpError("must have exactly one successor");
-
-  return success();
-}
-
 //===----------------------------------------------------------------------===//
 // spv.BranchConditionalOp
 //===----------------------------------------------------------------------===//
@@ -1114,10 +1088,6 @@
 }

 static LogicalResult verify(spirv::BranchConditionalOp branchOp) {
-  auto *op = branchOp.getOperation();
-  if (op->getNumSuccessors() != 2)
-    return branchOp.emitOpError("must have exactly two successors");
-
   if (auto weights = branchOp.branch_weights()) {
     if (weights->getValue().size() != 2) {
       return branchOp.emitOpError("must have
exactly two branch weights"); diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp --- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp +++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp @@ -414,20 +414,6 @@ }; } // end anonymous namespace. -static ParseResult parseBranchOp(OpAsmParser &parser, OperationState &result) { - Block *dest; - SmallVector destOperands; - if (parser.parseSuccessorAndUseList(dest, destOperands)) - return failure(); - result.addSuccessor(dest, destOperands); - return success(); -} - -static void print(OpAsmPrinter &p, BranchOp op) { - p << "br "; - p.printSuccessorAndUseList(op.getOperation(), 0); -} - Block *BranchOp::getDest() { return getSuccessor(0); } void BranchOp::setDest(Block *block) { return setSuccessor(block, 0); } @@ -505,29 +491,6 @@ }; } // end anonymous namespace. -static ParseResult parseCallIndirectOp(OpAsmParser &parser, - OperationState &result) { - FunctionType calleeType; - OpAsmParser::OperandType callee; - llvm::SMLoc operandsLoc; - SmallVector operands; - return failure( - parser.parseOperand(callee) || parser.getCurrentLocation(&operandsLoc) || - parser.parseOperandList(operands, OpAsmParser::Delimiter::Paren) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(calleeType) || - parser.resolveOperand(callee, calleeType, result.operands) || - parser.resolveOperands(operands, calleeType.getInputs(), operandsLoc, - result.operands) || - parser.addTypesToList(calleeType.getResults(), result.types)); -} - -static void print(OpAsmPrinter &p, CallIndirectOp op) { - p << "call_indirect " << op.getCallee() << '(' << op.getArgOperands() << ')'; - p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"callee"}); - p << " : " << op.getCallee().getType(); -} - void CallIndirectOp::getCanonicalizationPatterns( OwningRewritePatternList &results, MLIRContext *context) { results.insert(context); @@ -570,55 +533,6 @@ build->getI64IntegerAttr(static_cast(predicate))); } -static ParseResult parseCmpIOp(OpAsmParser &parser, OperationState &result) { - SmallVector ops; - SmallVector attrs; - Attribute predicateNameAttr; - Type type; - if (parser.parseAttribute(predicateNameAttr, CmpIOp::getPredicateAttrName(), - attrs) || - parser.parseComma() || parser.parseOperandList(ops, 2) || - parser.parseOptionalAttrDict(attrs) || parser.parseColonType(type) || - parser.resolveOperands(ops, type, result.operands)) - return failure(); - - if (!predicateNameAttr.isa()) - return parser.emitError(parser.getNameLoc(), - "expected string comparison predicate attribute"); - - // Rewrite string attribute to an enum value. 
- StringRef predicateName = predicateNameAttr.cast().getValue(); - Optional predicate = symbolizeCmpIPredicate(predicateName); - if (!predicate.hasValue()) - return parser.emitError(parser.getNameLoc()) - << "unknown comparison predicate \"" << predicateName << "\""; - - auto builder = parser.getBuilder(); - Type i1Type = getCheckedI1SameShape(type); - if (!i1Type) - return parser.emitError(parser.getNameLoc(), - "expected type with valid i1 shape"); - - attrs[0].second = builder.getI64IntegerAttr(static_cast(*predicate)); - result.attributes = attrs; - - result.addTypes({i1Type}); - return success(); -} - -static void print(OpAsmPrinter &p, CmpIOp op) { - p << "cmpi "; - - Builder b(op.getContext()); - auto predicateValue = - op.getAttrOfType(CmpIOp::getPredicateAttrName()).getInt(); - p << '"' << stringifyCmpIPredicate(static_cast(predicateValue)) - << '"' << ", " << op.lhs() << ", " << op.rhs(); - p.printOptionalAttrDict(op.getAttrs(), - /*elidedAttrs=*/{CmpIOp::getPredicateAttrName()}); - p << " : " << op.lhs().getType(); -} - // Compute `lhs` `pred` `rhs`, where `pred` is one of the known integer // comparison predicates. static bool applyCmpPredicate(CmpIPredicate predicate, const APInt &lhs, @@ -882,42 +796,6 @@ }; } // end anonymous namespace. -static ParseResult parseCondBranchOp(OpAsmParser &parser, - OperationState &result) { - SmallVector destOperands; - Block *dest; - OpAsmParser::OperandType condInfo; - - // Parse the condition. - Type int1Ty = parser.getBuilder().getI1Type(); - if (parser.parseOperand(condInfo) || parser.parseComma() || - parser.resolveOperand(condInfo, int1Ty, result.operands)) { - return parser.emitError(parser.getNameLoc(), - "expected condition type was boolean (i1)"); - } - - // Parse the true successor. - if (parser.parseSuccessorAndUseList(dest, destOperands)) - return failure(); - result.addSuccessor(dest, destOperands); - - // Parse the false successor. 
- destOperands.clear(); - if (parser.parseComma() || - parser.parseSuccessorAndUseList(dest, destOperands)) - return failure(); - result.addSuccessor(dest, destOperands); - - return success(); -} - -static void print(OpAsmPrinter &p, CondBranchOp op) { - p << "cond_br " << op.getCondition() << ", "; - p.printSuccessorAndUseList(op.getOperation(), CondBranchOp::trueIndex); - p << ", "; - p.printSuccessorAndUseList(op.getOperation(), CondBranchOp::falseIndex); -} - void CondBranchOp::getCanonicalizationPatterns( OwningRewritePatternList &results, MLIRContext *context) { results.insert(context); @@ -1486,30 +1364,6 @@ // ExtractElementOp //===----------------------------------------------------------------------===// -static void print(OpAsmPrinter &p, ExtractElementOp op) { - p << "extract_element " << op.getAggregate() << '[' << op.getIndices(); - p << ']'; - p.printOptionalAttrDict(op.getAttrs()); - p << " : " << op.getAggregate().getType(); -} - -static ParseResult parseExtractElementOp(OpAsmParser &parser, - OperationState &result) { - OpAsmParser::OperandType aggregateInfo; - SmallVector indexInfo; - ShapedType type; - - auto indexTy = parser.getBuilder().getIndexType(); - return failure( - parser.parseOperand(aggregateInfo) || - parser.parseOperandList(indexInfo, OpAsmParser::Delimiter::Square) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(type) || - parser.resolveOperand(aggregateInfo, type, result.operands) || - parser.resolveOperands(indexInfo, indexTy, result.operands) || - parser.addTypeToList(type.getElementType(), result.types)); -} - static LogicalResult verify(ExtractElementOp op) { // Verify the # indices match if we have a ranked type. auto aggregateType = op.getAggregate().getType().cast(); @@ -1577,28 +1431,6 @@ // LoadOp //===----------------------------------------------------------------------===// -static void print(OpAsmPrinter &p, LoadOp op) { - p << "load " << op.getMemRef() << '[' << op.getIndices() << ']'; - p.printOptionalAttrDict(op.getAttrs()); - p << " : " << op.getMemRefType(); -} - -static ParseResult parseLoadOp(OpAsmParser &parser, OperationState &result) { - OpAsmParser::OperandType memrefInfo; - SmallVector indexInfo; - MemRefType type; - - auto indexTy = parser.getBuilder().getIndexType(); - return failure( - parser.parseOperand(memrefInfo) || - parser.parseOperandList(indexInfo, OpAsmParser::Delimiter::Square) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(type) || - parser.resolveOperand(memrefInfo, type, result.operands) || - parser.resolveOperands(indexInfo, indexTy, result.operands) || - parser.addTypeToList(type.getElementType(), result.types)); -} - static LogicalResult verify(LoadOp op) { if (op.getNumOperands() != 1 + op.getMemRefType().getRank()) return op.emitOpError("incorrect number of indices for load"); @@ -1854,21 +1686,6 @@ // ReturnOp //===----------------------------------------------------------------------===// -static ParseResult parseReturnOp(OpAsmParser &parser, OperationState &result) { - SmallVector opInfo; - SmallVector types; - llvm::SMLoc loc = parser.getCurrentLocation(); - return failure(parser.parseOperandList(opInfo) || - (!opInfo.empty() && parser.parseColonTypeList(types)) || - parser.resolveOperands(opInfo, types, loc, result.operands)); -} - -static void print(OpAsmPrinter &p, ReturnOp op) { - p << "return"; - if (op.getNumOperands() != 0) - p << ' ' << op.getOperands() << " : " << op.getOperandTypes(); -} - static LogicalResult verify(ReturnOp op) { 
auto function = cast(op.getParentOp()); @@ -1902,31 +1719,6 @@ // SelectOp //===----------------------------------------------------------------------===// -static ParseResult parseSelectOp(OpAsmParser &parser, OperationState &result) { - SmallVector ops; - SmallVector attrs; - Type type; - if (parser.parseOperandList(ops, 3) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(type)) - return failure(); - - auto i1Type = getCheckedI1SameShape(type); - if (!i1Type) - return parser.emitError(parser.getNameLoc(), - "expected type with valid i1 shape"); - - std::array types = {i1Type, type, type}; - return failure(parser.resolveOperands(ops, types, parser.getNameLoc(), - result.operands) || - parser.addTypeToList(type, result.types)); -} - -static void print(OpAsmPrinter &p, SelectOp op) { - p << "select " << op.getOperands() << " : " << op.getTrueValue().getType(); - p.printOptionalAttrDict(op.getAttrs()); -} - OpFoldResult SelectOp::fold(ArrayRef operands) { auto condition = getCondition(); @@ -1968,25 +1760,6 @@ // SplatOp //===----------------------------------------------------------------------===// -static void print(OpAsmPrinter &p, SplatOp op) { - p << "splat " << op.getOperand(); - p.printOptionalAttrDict(op.getAttrs()); - p << " : " << op.getType(); -} - -static ParseResult parseSplatOp(OpAsmParser &parser, OperationState &result) { - OpAsmParser::OperandType splatValueInfo; - ShapedType shapedType; - - return failure(parser.parseOperand(splatValueInfo) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(shapedType) || - parser.resolveOperand(splatValueInfo, - shapedType.getElementType(), - result.operands) || - parser.addTypeToList(shapedType, result.types)); -} - static LogicalResult verify(SplatOp op) { // TODO: we could replace this by a trait. 
if (op.getOperand().getType() != @@ -2017,32 +1790,6 @@ // StoreOp //===----------------------------------------------------------------------===// -static void print(OpAsmPrinter &p, StoreOp op) { - p << "store " << op.getValueToStore(); - p << ", " << op.getMemRef() << '[' << op.getIndices() << ']'; - p.printOptionalAttrDict(op.getAttrs()); - p << " : " << op.getMemRefType(); -} - -static ParseResult parseStoreOp(OpAsmParser &parser, OperationState &result) { - OpAsmParser::OperandType storeValueInfo; - OpAsmParser::OperandType memrefInfo; - SmallVector indexInfo; - MemRefType memrefType; - - auto indexTy = parser.getBuilder().getIndexType(); - return failure( - parser.parseOperand(storeValueInfo) || parser.parseComma() || - parser.parseOperand(memrefInfo) || - parser.parseOperandList(indexInfo, OpAsmParser::Delimiter::Square) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(memrefType) || - parser.resolveOperand(storeValueInfo, memrefType.getElementType(), - result.operands) || - parser.resolveOperand(memrefInfo, memrefType, result.operands) || - parser.resolveOperands(indexInfo, indexTy, result.operands)); -} - static LogicalResult verify(StoreOp op) { if (op.getNumOperands() != 2 + op.getMemRefType().getRank()) return op.emitOpError("store index operand count not equal to memref rank"); @@ -2156,51 +1903,6 @@ return NoneType::get(type.getContext()); } -//===----------------------------------------------------------------------===// -// TensorLoadOp -//===----------------------------------------------------------------------===// - -static void print(OpAsmPrinter &p, TensorLoadOp op) { - p << "tensor_load " << op.getOperand(); - p.printOptionalAttrDict(op.getAttrs()); - p << " : " << op.getOperand().getType(); -} - -static ParseResult parseTensorLoadOp(OpAsmParser &parser, - OperationState &result) { - OpAsmParser::OperandType op; - Type type; - return failure( - parser.parseOperand(op) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(type) || - parser.resolveOperand(op, type, result.operands) || - parser.addTypeToList(getTensorTypeFromMemRefType(type), result.types)); -} - -//===----------------------------------------------------------------------===// -// TensorStoreOp -//===----------------------------------------------------------------------===// - -static void print(OpAsmPrinter &p, TensorStoreOp op) { - p << "tensor_store " << op.tensor() << ", " << op.memref(); - p.printOptionalAttrDict(op.getAttrs()); - p << " : " << op.memref().getType(); -} - -static ParseResult parseTensorStoreOp(OpAsmParser &parser, - OperationState &result) { - SmallVector ops; - Type type; - llvm::SMLoc loc = parser.getCurrentLocation(); - return failure( - parser.parseOperandList(ops, /*requiredOperandCount=*/2) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(type) || - parser.resolveOperands(ops, {getTensorTypeFromMemRefType(type), type}, - loc, result.operands)); -} - //===----------------------------------------------------------------------===// // TruncateIOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/VectorOps/VectorOps.cpp b/mlir/lib/Dialect/VectorOps/VectorOps.cpp --- a/mlir/lib/Dialect/VectorOps/VectorOps.cpp +++ b/mlir/lib/Dialect/VectorOps/VectorOps.cpp @@ -412,31 +412,6 @@ // ExtractElementOp //===----------------------------------------------------------------------===// -static void print(OpAsmPrinter &p, vector::ExtractElementOp 
op) { - p << op.getOperationName() << " " << op.vector() << "[" << op.position() - << " : " << op.position().getType() << "]"; - p.printOptionalAttrDict(op.getAttrs()); - p << " : " << op.vector().getType(); -} - -static ParseResult parseExtractElementOp(OpAsmParser &parser, - OperationState &result) { - OpAsmParser::OperandType vector, position; - Type positionType; - VectorType vectorType; - if (parser.parseOperand(vector) || parser.parseLSquare() || - parser.parseOperand(position) || parser.parseColonType(positionType) || - parser.parseRSquare() || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(vectorType)) - return failure(); - Type resultType = vectorType.getElementType(); - return failure( - parser.resolveOperand(vector, vectorType, result.operands) || - parser.resolveOperand(position, positionType, result.operands) || - parser.addTypeToList(resultType, result.types)); -} - static LogicalResult verify(vector::ExtractElementOp op) { VectorType vectorType = op.getVectorType(); if (vectorType.getRank() != 1) @@ -715,33 +690,6 @@ // InsertElementOp //===----------------------------------------------------------------------===// -static void print(OpAsmPrinter &p, InsertElementOp op) { - p << op.getOperationName() << " " << op.source() << ", " << op.dest() << "[" - << op.position() << " : " << op.position().getType() << "]"; - p.printOptionalAttrDict(op.getAttrs()); - p << " : " << op.dest().getType(); -} - -static ParseResult parseInsertElementOp(OpAsmParser &parser, - OperationState &result) { - OpAsmParser::OperandType source, dest, position; - Type positionType; - VectorType destType; - if (parser.parseOperand(source) || parser.parseComma() || - parser.parseOperand(dest) || parser.parseLSquare() || - parser.parseOperand(position) || parser.parseColonType(positionType) || - parser.parseRSquare() || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(destType)) - return failure(); - Type sourceType = destType.getElementType(); - return failure( - parser.resolveOperand(source, sourceType, result.operands) || - parser.resolveOperand(dest, destType, result.operands) || - parser.resolveOperand(position, positionType, result.operands) || - parser.addTypeToList(destType, result.types)); -} - static LogicalResult verify(InsertElementOp op) { auto dstVectorType = op.getDestVectorType(); if (dstVectorType.getRank() != 1) diff --git a/mlir/lib/Parser/Parser.cpp b/mlir/lib/Parser/Parser.cpp --- a/mlir/lib/Parser/Parser.cpp +++ b/mlir/lib/Parser/Parser.cpp @@ -4423,6 +4423,15 @@ return parser.parseSuccessorAndUseList(dest, operands); } + /// Parse an optional operation successor and its operand list. 
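+  /// Returns llvm::None when no caret-prefixed block name is present, so
+  /// callers can distinguish an absent successor from a malformed one.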
+ OptionalParseResult + parseOptionalSuccessorAndUseList(Block *&dest, + SmallVectorImpl &operands) override { + if (parser.getToken().isNot(Token::caret_identifier)) + return llvm::None; + return parseSuccessorAndUseList(dest, operands); + } + //===--------------------------------------------------------------------===// // Type Parsing //===--------------------------------------------------------------------===// diff --git a/mlir/lib/TableGen/CMakeLists.txt b/mlir/lib/TableGen/CMakeLists.txt --- a/mlir/lib/TableGen/CMakeLists.txt +++ b/mlir/lib/TableGen/CMakeLists.txt @@ -10,6 +10,7 @@ OpTrait.cpp Pattern.cpp Predicate.cpp + Successor.cpp Type.cpp ADDITIONAL_HEADER_DIRS diff --git a/mlir/lib/TableGen/Constraint.cpp b/mlir/lib/TableGen/Constraint.cpp --- a/mlir/lib/TableGen/Constraint.cpp +++ b/mlir/lib/TableGen/Constraint.cpp @@ -23,6 +23,8 @@ kind = CK_Attr; } else if (record->isSubClassOf("RegionConstraint")) { kind = CK_Region; + } else if (record->isSubClassOf("SuccessorConstraint")) { + kind = CK_Successor; } else { assert(record->isSubClassOf("Constraint")); } diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp --- a/mlir/lib/TableGen/Operator.cpp +++ b/mlir/lib/TableGen/Operator.cpp @@ -159,6 +159,31 @@ return regions[index]; } +auto tblgen::Operator::successor_begin() const -> const_successor_iterator { + return successors.begin(); +} +auto tblgen::Operator::successor_end() const -> const_successor_iterator { + return successors.end(); +} +auto tblgen::Operator::getSuccessors() const + -> llvm::iterator_range { + return {successor_begin(), successor_end()}; +} + +unsigned tblgen::Operator::getNumSuccessors() const { + return successors.size(); +} + +const tblgen::NamedSuccessor & +tblgen::Operator::getSuccessor(unsigned index) const { + return successors[index]; +} + +unsigned tblgen::Operator::getNumVariadicSuccessors() const { + return llvm::count_if(successors, + [](const NamedSuccessor &c) { return c.isVariadic(); }); +} + auto tblgen::Operator::trait_begin() const -> const_trait_iterator { return traits.begin(); } @@ -285,6 +310,29 @@ results.push_back({name, TypeConstraint(resultDef)}); } + // Handle successors + auto *successorsDag = def.getValueAsDag("successors"); + auto *successorsOp = dyn_cast(successorsDag->getOperator()); + if (!successorsOp || successorsOp->getDef()->getName() != "successor") { + PrintFatalError(def.getLoc(), + "'successors' must have 'successor' directive"); + } + + for (unsigned i = 0, e = successorsDag->getNumArgs(); i < e; ++i) { + auto name = successorsDag->getArgNameStr(i); + auto *successorInit = dyn_cast(successorsDag->getArg(i)); + if (!successorInit) { + PrintFatalError(def.getLoc(), + Twine("undefined kind for successor #") + Twine(i)); + } + Successor successor(successorInit->getDef()); + + // Only support variadic successors if it is the last one for now. + if (i != e - 1 && successor.isVariadic()) + PrintFatalError(def.getLoc(), "only the last successor can be variadic"); + successors.push_back({name, successor}); + } + // Create list of traits, skipping over duplicates: appending to lists in // tablegen is easy, making them unique less so, so dedupe here. 
diff --git a/mlir/lib/TableGen/CMakeLists.txt b/mlir/lib/TableGen/CMakeLists.txt --- a/mlir/lib/TableGen/CMakeLists.txt +++ b/mlir/lib/TableGen/CMakeLists.txt @@ -10,6 +10,7 @@ OpTrait.cpp Pattern.cpp Predicate.cpp + Successor.cpp Type.cpp ADDITIONAL_HEADER_DIRS diff --git a/mlir/lib/TableGen/Constraint.cpp b/mlir/lib/TableGen/Constraint.cpp --- a/mlir/lib/TableGen/Constraint.cpp +++ b/mlir/lib/TableGen/Constraint.cpp @@ -23,6 +23,8 @@ kind = CK_Attr; } else if (record->isSubClassOf("RegionConstraint")) { kind = CK_Region; + } else if (record->isSubClassOf("SuccessorConstraint")) { + kind = CK_Successor; } else { assert(record->isSubClassOf("Constraint")); } diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp --- a/mlir/lib/TableGen/Operator.cpp +++ b/mlir/lib/TableGen/Operator.cpp @@ -159,6 +159,31 @@ return regions[index]; } +auto tblgen::Operator::successor_begin() const -> const_successor_iterator { + return successors.begin(); +} +auto tblgen::Operator::successor_end() const -> const_successor_iterator { + return successors.end(); +} +auto tblgen::Operator::getSuccessors() const + -> llvm::iterator_range<const_successor_iterator> { + return {successor_begin(), successor_end()}; +} + +unsigned tblgen::Operator::getNumSuccessors() const { + return successors.size(); +} + +const tblgen::NamedSuccessor & +tblgen::Operator::getSuccessor(unsigned index) const { + return successors[index]; +} + +unsigned tblgen::Operator::getNumVariadicSuccessors() const { + return llvm::count_if(successors, + [](const NamedSuccessor &c) { return c.isVariadic(); }); +} + auto tblgen::Operator::trait_begin() const -> const_trait_iterator { return traits.begin(); } @@ -285,6 +310,29 @@ results.push_back({name, TypeConstraint(resultDef)}); } + // Handle successors + auto *successorsDag = def.getValueAsDag("successors"); + auto *successorsOp = dyn_cast<DefInit>(successorsDag->getOperator()); + if (!successorsOp || successorsOp->getDef()->getName() != "successor") { + PrintFatalError(def.getLoc(), + "'successors' must have 'successor' directive"); + } + + for (unsigned i = 0, e = successorsDag->getNumArgs(); i < e; ++i) { + auto name = successorsDag->getArgNameStr(i); + auto *successorInit = dyn_cast<DefInit>(successorsDag->getArg(i)); + if (!successorInit) { + PrintFatalError(def.getLoc(), + Twine("undefined kind for successor #") + Twine(i)); + } + Successor successor(successorInit->getDef()); + + // Only support variadic successors if it is the last one for now. + if (i != e - 1 && successor.isVariadic()) + PrintFatalError(def.getLoc(), "only the last successor can be variadic"); + successors.push_back({name, successor}); + } + // Create list of traits, skipping over duplicates: appending to lists in // tablegen is easy, making them unique less so, so dedupe here. if (auto traitList = def.getValueAsListInit("traits")) { diff --git a/mlir/lib/TableGen/Successor.cpp b/mlir/lib/TableGen/Successor.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/TableGen/Successor.cpp @@ -0,0 +1,24 @@ +//===- Successor.cpp - Successor class ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Successor wrapper to simplify using TableGen Record defining an MLIR +// Successor. +// +//===----------------------------------------------------------------------===// + +#include "mlir/TableGen/Successor.h" +#include "mlir/ADT/TypeSwitch.h" +#include "llvm/TableGen/Record.h" + +using namespace mlir; +using namespace mlir::tblgen; + +// Returns true if this successor is variadic. +bool Successor::isVariadic() const { + return def->isSubClassOf("VariadicSuccessor"); +} diff --git a/mlir/test/Dialect/Loops/parallel-loop-tiling.mlir b/mlir/test/Dialect/Loops/parallel-loop-tiling.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Loops/parallel-loop-tiling.mlir @@ -0,0 +1,80 @@ +// RUN: mlir-opt %s -pass-pipeline='func(parallel-loop-tiling{parallel-loop-tile-sizes=1,4})' -split-input-file | FileCheck %s --dump-input-on-failure + +func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, %arg4 : index, %arg5 : index, + %A: memref<?x?xf32>, %B: memref<?x?xf32>, + %C: memref<?x?xf32>, %result: memref<?x?xf32>) { + loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) step (%arg4, %arg5) { + %B_elem = load %B[%i0, %i1] : memref<?x?xf32> + %C_elem = load %C[%i0, %i1] : memref<?x?xf32> + %sum_elem = addf %B_elem, %C_elem : f32 + store %sum_elem, %result[%i0, %i1] : memref<?x?xf32> + } + return +} + +// CHECK: #map0 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)> +// CHECK-LABEL: func @parallel_loop( +// CHECK-SAME: [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: index, [[VAL_6:%.*]]: memref<?x?xf32>, [[VAL_7:%.*]]: memref<?x?xf32>, [[VAL_8:%.*]]: memref<?x?xf32>, [[VAL_9:%.*]]: memref<?x?xf32>) { +// CHECK: [[VAL_10:%.*]] = constant 0 : index +// CHECK: [[VAL_11:%.*]] = constant 1 : index +// CHECK: [[VAL_12:%.*]] = constant 4 : index +// CHECK: [[VAL_13:%.*]] = muli [[VAL_4]], [[VAL_11]] : index +// CHECK: [[VAL_14:%.*]] = muli [[VAL_5]], [[VAL_12]] : index +// CHECK: loop.parallel ([[VAL_15:%.*]], [[VAL_16:%.*]]) = ([[VAL_0]], [[VAL_1]]) to ([[VAL_2]], [[VAL_3]]) step ([[VAL_13]], [[VAL_14]]) { +// CHECK: [[VAL_17:%.*]] = affine.min #map0([[VAL_11]], [[VAL_2]], [[VAL_15]]) +// CHECK: [[VAL_18:%.*]] = affine.min #map0([[VAL_12]], [[VAL_3]], [[VAL_16]]) +// CHECK: loop.parallel ([[VAL_19:%.*]], [[VAL_20:%.*]]) = ([[VAL_10]], [[VAL_10]]) to ([[VAL_17]], [[VAL_18]]) step ([[VAL_4]], [[VAL_5]]) { +// CHECK: [[VAL_21:%.*]] = load [[VAL_7]]{{\[}}[[VAL_19]], [[VAL_20]]] : memref<?x?xf32> +// CHECK: [[VAL_22:%.*]] = load [[VAL_8]]{{\[}}[[VAL_19]], [[VAL_20]]] : memref<?x?xf32> +// CHECK: [[VAL_23:%.*]] = addf [[VAL_21]], [[VAL_22]] : f32 +// CHECK: store [[VAL_23]], [[VAL_9]]{{\[}}[[VAL_19]], [[VAL_20]]] : memref<?x?xf32> +// CHECK: } +// CHECK: } +// CHECK: return + +// ----- + +func @tile_nested_innermost() { + %c2 = constant 2 : index + %c0 = constant 0 : index + %c1 = constant 1 : index + loop.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) { + loop.parallel (%k, %l) = (%c0, %c0) to (%c2, %c2) step
(%c1, %c1) { + } + } + loop.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) { + } + return +} + +// CHECK-LABEL: func @tile_nested_innermost() { +// CHECK: [[VAL_24:%.*]] = constant 2 : index +// CHECK: [[VAL_25:%.*]] = constant 0 : index +// CHECK: [[VAL_26:%.*]] = constant 1 : index +// CHECK: loop.parallel ([[VAL_27:%.*]], [[VAL_28:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_26]], [[VAL_26]]) { +// CHECK: [[VAL_29:%.*]] = constant 0 : index +// CHECK: [[VAL_30:%.*]] = constant 1 : index +// CHECK: [[VAL_31:%.*]] = constant 4 : index +// CHECK: [[VAL_32:%.*]] = muli [[VAL_26]], [[VAL_30]] : index +// CHECK: [[VAL_33:%.*]] = muli [[VAL_26]], [[VAL_31]] : index +// CHECK: loop.parallel ([[VAL_34:%.*]], [[VAL_35:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_32]], [[VAL_33]]) { +// CHECK: [[VAL_36:%.*]] = affine.min #map0([[VAL_30]], [[VAL_24]], [[VAL_34]]) +// CHECK: [[VAL_37:%.*]] = affine.min #map0([[VAL_31]], [[VAL_24]], [[VAL_35]]) +// CHECK: loop.parallel ([[VAL_38:%.*]], [[VAL_39:%.*]]) = ([[VAL_29]], [[VAL_29]]) to ([[VAL_36]], [[VAL_37]]) step ([[VAL_26]], [[VAL_26]]) { +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: [[VAL_40:%.*]] = constant 0 : index +// CHECK: [[VAL_41:%.*]] = constant 1 : index +// CHECK: [[VAL_42:%.*]] = constant 4 : index +// CHECK: [[VAL_43:%.*]] = muli [[VAL_26]], [[VAL_41]] : index +// CHECK: [[VAL_44:%.*]] = muli [[VAL_26]], [[VAL_42]] : index +// CHECK: loop.parallel ([[VAL_45:%.*]], [[VAL_46:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_43]], [[VAL_44]]) { +// CHECK: [[VAL_47:%.*]] = affine.min #map0([[VAL_41]], [[VAL_24]], [[VAL_45]]) +// CHECK: [[VAL_48:%.*]] = affine.min #map0([[VAL_42]], [[VAL_24]], [[VAL_46]]) +// CHECK: loop.parallel ([[VAL_49:%.*]], [[VAL_50:%.*]]) = ([[VAL_40]], [[VAL_40]]) to ([[VAL_47]], [[VAL_48]]) step ([[VAL_26]], [[VAL_26]]) { +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } diff --git a/mlir/test/Dialect/SPIRV/control-flow-ops.mlir b/mlir/test/Dialect/SPIRV/control-flow-ops.mlir --- a/mlir/test/Dialect/SPIRV/control-flow-ops.mlir +++ b/mlir/test/Dialect/SPIRV/control-flow-ops.mlir @@ -24,15 +24,15 @@ // ----- func @missing_accessor() -> () { + // expected-error @+1 {{has incorrect number of successors: expected 1 but found 0}} spv.Branch - // expected-error @+1 {{expected block name}} } // ----- func @wrong_accessor_count() -> () { %true = spv.constant true - // expected-error @+1 {{must have exactly one successor}} + // expected-error @+1 {{incorrect number of successors: expected 1 but found 2}} "spv.Branch"()[^one, ^two] : () -> () ^one: spv.Return @@ -116,7 +116,7 @@ func @wrong_accessor_count() -> () { %true = spv.constant true - // expected-error @+1 {{must have exactly two successors}} + // expected-error @+1 {{incorrect number of successors: expected 2 but found 1}} "spv.BranchConditional"(%true)[^one] : (i1) -> () ^one: spv.Return diff --git a/mlir/test/Examples/Toy/Ch2/codegen.toy b/mlir/test/Examples/Toy/Ch2/codegen.toy --- a/mlir/test/Examples/Toy/Ch2/codegen.toy +++ b/mlir/test/Examples/Toy/Ch2/codegen.toy @@ -15,17 +15,17 @@ # CHECK-LABEL: func @multiply_transpose( # CHECK-SAME: [[VAL_0:%.*]]: tensor<*xf64>, [[VAL_1:%.*]]: tensor<*xf64>) -> tensor<*xf64> -# CHECK: [[VAL_2:%.*]] = "toy.transpose"([[VAL_0]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_3:%.*]] = "toy.transpose"([[VAL_1]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_4:%.*]] = "toy.mul"([[VAL_2]], [[VAL_3]]) : 
(tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.return"([[VAL_4]]) : (tensor<*xf64>) -> () +# CHECK: [[VAL_2:%.*]] = toy.transpose([[VAL_0]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_3:%.*]] = toy.transpose([[VAL_1]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_4:%.*]] = toy.mul [[VAL_2]], [[VAL_3]] : tensor<*xf64> +# CHECK-NEXT: toy.return [[VAL_4]] : tensor<*xf64> # CHECK-LABEL: func @main() -# CHECK-NEXT: [[VAL_5:%.*]] = "toy.constant"() {value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_6:%.*]] = "toy.reshape"([[VAL_5]]) : (tensor<2x3xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_7:%.*]] = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> -# CHECK-NEXT: [[VAL_8:%.*]] = "toy.reshape"([[VAL_7]]) : (tensor<6xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_9:%.*]] = "toy.generic_call"([[VAL_6]], [[VAL_8]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_10:%.*]] = "toy.generic_call"([[VAL_8]], [[VAL_6]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.print"([[VAL_10]]) : (tensor<*xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: [[VAL_5:%.*]] = toy.constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# CHECK-NEXT: [[VAL_6:%.*]] = toy.reshape([[VAL_5]] : tensor<2x3xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_7:%.*]] = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# CHECK-NEXT: [[VAL_8:%.*]] = toy.reshape([[VAL_7]] : tensor<6xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_9:%.*]] = toy.generic_call @multiply_transpose([[VAL_6]], [[VAL_8]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: [[VAL_10:%.*]] = toy.generic_call @multiply_transpose([[VAL_8]], [[VAL_6]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: toy.print [[VAL_10]] : tensor<*xf64> +# CHECK-NEXT: toy.return diff --git a/mlir/test/Examples/Toy/Ch2/scalar.toy b/mlir/test/Examples/Toy/Ch2/scalar.toy --- a/mlir/test/Examples/Toy/Ch2/scalar.toy +++ b/mlir/test/Examples/Toy/Ch2/scalar.toy @@ -6,9 +6,9 @@ } # CHECK-LABEL: func @main() { -# CHECK-NEXT: %0 = "toy.constant"() {value = dense<5.500000e+00> : tensor<f64>} : () -> tensor<f64> -# CHECK-NEXT: %1 = "toy.reshape"(%0) : (tensor<f64>) -> tensor<2x2xf64> -# CHECK-NEXT: "toy.print"(%1) : (tensor<2x2xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: %0 = toy.constant dense<5.500000e+00> : tensor<f64> +# CHECK-NEXT: %1 = toy.reshape(%0 : tensor<f64>) to tensor<2x2xf64> +# CHECK-NEXT: toy.print %1 : tensor<2x2xf64> +# CHECK-NEXT: toy.return # CHECK-NEXT: } diff --git a/mlir/test/Examples/Toy/Ch3/codegen.toy b/mlir/test/Examples/Toy/Ch3/codegen.toy --- a/mlir/test/Examples/Toy/Ch3/codegen.toy +++ b/mlir/test/Examples/Toy/Ch3/codegen.toy @@ -15,17 +15,17 @@ # CHECK-LABEL: func @multiply_transpose( # CHECK-SAME: [[VAL_0:%.*]]: tensor<*xf64>, [[VAL_1:%.*]]: tensor<*xf64>) -> tensor<*xf64> -# CHECK: [[VAL_2:%.*]] = "toy.transpose"([[VAL_0]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_3:%.*]] = "toy.transpose"([[VAL_1]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_4:%.*]] =
"toy.mul"([[VAL_2]], [[VAL_3]]) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.return"([[VAL_4]]) : (tensor<*xf64>) -> () +# CHECK: [[VAL_2:%.*]] = toy.transpose([[VAL_0]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_3:%.*]] = toy.transpose([[VAL_1]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_4:%.*]] = toy.mul [[VAL_2]], [[VAL_3]] : tensor<*xf64> +# CHECK-NEXT: toy.return [[VAL_4]] : tensor<*xf64> # CHECK-LABEL: func @main() -# CHECK-NEXT: [[VAL_5:%.*]] = "toy.constant"() {value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_6:%.*]] = "toy.reshape"([[VAL_5]]) : (tensor<2x3xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_7:%.*]] = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> -# CHECK-NEXT: [[VAL_8:%.*]] = "toy.reshape"([[VAL_7]]) : (tensor<6xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_9:%.*]] = "toy.generic_call"([[VAL_6]], [[VAL_8]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_10:%.*]] = "toy.generic_call"([[VAL_8]], [[VAL_6]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.print"([[VAL_10]]) : (tensor<*xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: [[VAL_5:%.*]] = toy.constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# CHECK-NEXT: [[VAL_6:%.*]] = toy.reshape([[VAL_5]] : tensor<2x3xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_7:%.*]] = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# CHECK-NEXT: [[VAL_8:%.*]] = toy.reshape([[VAL_7]] : tensor<6xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_9:%.*]] = toy.generic_call @multiply_transpose([[VAL_6]], [[VAL_8]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: [[VAL_10:%.*]] = toy.generic_call @multiply_transpose([[VAL_8]], [[VAL_6]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: toy.print [[VAL_10]] : tensor<*xf64> +# CHECK-NEXT: toy.return diff --git a/mlir/test/Examples/Toy/Ch3/scalar.toy b/mlir/test/Examples/Toy/Ch3/scalar.toy --- a/mlir/test/Examples/Toy/Ch3/scalar.toy +++ b/mlir/test/Examples/Toy/Ch3/scalar.toy @@ -6,9 +6,9 @@ } # CHECK-LABEL: func @main() { -# CHECK-NEXT: %0 = "toy.constant"() {value = dense<5.500000e+00> : tensor} : () -> tensor -# CHECK-NEXT: %1 = "toy.reshape"(%0) : (tensor) -> tensor<2x2xf64> -# CHECK-NEXT: "toy.print"(%1) : (tensor<2x2xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: %0 = toy.constant dense<5.500000e+00> : tensor +# CHECK-NEXT: %1 = toy.reshape(%0 : tensor) to tensor<2x2xf64> +# CHECK-NEXT: toy.print %1 : tensor<2x2xf64> +# CHECK-NEXT: toy.return # CHECK-NEXT: } diff --git a/mlir/test/Examples/Toy/Ch4/codegen.toy b/mlir/test/Examples/Toy/Ch4/codegen.toy --- a/mlir/test/Examples/Toy/Ch4/codegen.toy +++ b/mlir/test/Examples/Toy/Ch4/codegen.toy @@ -15,17 +15,17 @@ # CHECK-LABEL: func @multiply_transpose( # CHECK-SAME: [[VAL_0:%.*]]: tensor<*xf64>, [[VAL_1:%.*]]: tensor<*xf64>) -> tensor<*xf64> -# CHECK: [[VAL_2:%.*]] = "toy.transpose"([[VAL_0]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_3:%.*]] = "toy.transpose"([[VAL_1]]) : (tensor<*xf64>) -> tensor<*xf64> -# 
CHECK-NEXT: [[VAL_4:%.*]] = "toy.mul"([[VAL_2]], [[VAL_3]]) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.return"([[VAL_4]]) : (tensor<*xf64>) -> () +# CHECK: [[VAL_2:%.*]] = toy.transpose([[VAL_0]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_3:%.*]] = toy.transpose([[VAL_1]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_4:%.*]] = toy.mul [[VAL_2]], [[VAL_3]] : tensor<*xf64> +# CHECK-NEXT: toy.return [[VAL_4]] : tensor<*xf64> # CHECK-LABEL: func @main() -# CHECK-NEXT: [[VAL_5:%.*]] = "toy.constant"() {value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_6:%.*]] = "toy.reshape"([[VAL_5]]) : (tensor<2x3xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_7:%.*]] = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> -# CHECK-NEXT: [[VAL_8:%.*]] = "toy.reshape"([[VAL_7]]) : (tensor<6xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_9:%.*]] = "toy.generic_call"([[VAL_6]], [[VAL_8]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_10:%.*]] = "toy.generic_call"([[VAL_8]], [[VAL_6]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.print"([[VAL_10]]) : (tensor<*xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: [[VAL_5:%.*]] = toy.constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# CHECK-NEXT: [[VAL_6:%.*]] = toy.reshape([[VAL_5]] : tensor<2x3xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_7:%.*]] = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# CHECK-NEXT: [[VAL_8:%.*]] = toy.reshape([[VAL_7]] : tensor<6xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_9:%.*]] = toy.generic_call @multiply_transpose([[VAL_6]], [[VAL_8]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: [[VAL_10:%.*]] = toy.generic_call @multiply_transpose([[VAL_8]], [[VAL_6]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: toy.print [[VAL_10]] : tensor<*xf64> +# CHECK-NEXT: toy.return diff --git a/mlir/test/Examples/Toy/Ch4/scalar.toy b/mlir/test/Examples/Toy/Ch4/scalar.toy --- a/mlir/test/Examples/Toy/Ch4/scalar.toy +++ b/mlir/test/Examples/Toy/Ch4/scalar.toy @@ -6,9 +6,9 @@ } # CHECK-LABEL: func @main() { -# CHECK-NEXT: %0 = "toy.constant"() {value = dense<5.500000e+00> : tensor<f64>} : () -> tensor<f64> -# CHECK-NEXT: %1 = "toy.reshape"(%0) : (tensor<f64>) -> tensor<2x2xf64> -# CHECK-NEXT: "toy.print"(%1) : (tensor<2x2xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: %0 = toy.constant dense<5.500000e+00> : tensor<f64> +# CHECK-NEXT: %1 = toy.reshape(%0 : tensor<f64>) to tensor<2x2xf64> +# CHECK-NEXT: toy.print %1 : tensor<2x2xf64> +# CHECK-NEXT: toy.return # CHECK-NEXT: } diff --git a/mlir/test/Examples/Toy/Ch4/shape_inference.mlir b/mlir/test/Examples/Toy/Ch4/shape_inference.mlir --- a/mlir/test/Examples/Toy/Ch4/shape_inference.mlir +++ b/mlir/test/Examples/Toy/Ch4/shape_inference.mlir @@ -4,28 +4,28 @@ func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> attributes { sym_visibility = "private" } { - %0 = "toy.transpose"(%arg0) : (tensor<*xf64>) -> tensor<*xf64> - %1 = "toy.transpose"(%arg1) : (tensor<*xf64>) ->
tensor<*xf64> - %2 = "toy.mul"(%0, %1) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> - "toy.return"(%2) : (tensor<*xf64>) -> () + %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> + %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> + %2 = toy.mul %0, %1 : tensor<*xf64> + toy.return %2 : tensor<*xf64> } func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %1 = "toy.reshape"(%0) : (tensor<2x3xf64>) -> tensor<2x3xf64> - %2 = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> - %3 = "toy.reshape"(%2) : (tensor<6xf64>) -> tensor<2x3xf64> - %4 = "toy.generic_call"(%1, %3) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> - %5 = "toy.generic_call"(%3, %1) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> - "toy.print"(%5) : (tensor<*xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> + %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> + %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> + %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + toy.print %5 : tensor<*xf64> + toy.return } // CHECK-NOT: func @multiply_transpose // CHECK-NOT: tensor<*xf64> // CHECK-LABEL: func @main() -// CHECK: [[VAL_0:%.*]] = "toy.constant"() {value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> -// CHECK: [[VAL_1:%.*]] = "toy.transpose"([[VAL_0]]) : (tensor<2x3xf64>) -> tensor<3x2xf64> -// CHECK: [[VAL_2:%.*]] = "toy.mul"([[VAL_1]], [[VAL_1]]) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> -// CHECK: "toy.print"([[VAL_2]]) : (tensor<3x2xf64>) -> () -// CHECK: "toy.return"() : () -> () +// CHECK: [[VAL_0:%.*]] = toy.constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +// CHECK: [[VAL_1:%.*]] = toy.transpose([[VAL_0]] : tensor<2x3xf64>) to tensor<3x2xf64> +// CHECK: [[VAL_2:%.*]] = toy.mul [[VAL_1]], [[VAL_1]] : tensor<3x2xf64> +// CHECK: toy.print [[VAL_2]] : tensor<3x2xf64> +// CHECK: toy.return diff --git a/mlir/test/Examples/Toy/Ch5/affine-lowering.mlir b/mlir/test/Examples/Toy/Ch5/affine-lowering.mlir --- a/mlir/test/Examples/Toy/Ch5/affine-lowering.mlir +++ b/mlir/test/Examples/Toy/Ch5/affine-lowering.mlir @@ -2,11 +2,11 @@ // RUN: toyc-ch5 %s -emit=mlir-affine -opt 2>&1 | FileCheck %s --check-prefix=OPT func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %2 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> - %3 = "toy.mul"(%2, %2) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> - "toy.print"(%3) : (tensor<3x2xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 
5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> + %3 = toy.mul %2, %2 : tensor<3x2xf64> + toy.print %3 : tensor<3x2xf64> + toy.return } // CHECK-LABEL: func @main() @@ -35,7 +35,7 @@ // CHECK: [[VAL_15:%.*]] = affine.load [[VAL_7]]{{\[}}[[VAL_12]], [[VAL_13]]] : memref<3x2xf64> // CHECK: [[VAL_16:%.*]] = mulf [[VAL_14]], [[VAL_15]] : f64 // CHECK: affine.store [[VAL_16]], [[VAL_6]]{{\[}}[[VAL_12]], [[VAL_13]]] : memref<3x2xf64> -// CHECK: "toy.print"([[VAL_6]]) : (memref<3x2xf64>) -> () +// CHECK: toy.print [[VAL_6]] : memref<3x2xf64> // CHECK: dealloc [[VAL_8]] : memref<2x3xf64> // CHECK: dealloc [[VAL_7]] : memref<3x2xf64> // CHECK: dealloc [[VAL_6]] : memref<3x2xf64> @@ -60,6 +60,6 @@ // OPT: [[VAL_10:%.*]] = affine.load [[VAL_7]]{{\[}}[[VAL_9]], [[VAL_8]]] : memref<2x3xf64> // OPT: [[VAL_11:%.*]] = mulf [[VAL_10]], [[VAL_10]] : f64 // OPT: affine.store [[VAL_11]], [[VAL_6]]{{\[}}[[VAL_8]], [[VAL_9]]] : memref<3x2xf64> -// OPT: "toy.print"([[VAL_6]]) : (memref<3x2xf64>) -> () +// OPT: toy.print [[VAL_6]] : memref<3x2xf64> // OPT: dealloc [[VAL_7]] : memref<2x3xf64> // OPT: dealloc [[VAL_6]] : memref<3x2xf64> diff --git a/mlir/test/Examples/Toy/Ch5/codegen.toy b/mlir/test/Examples/Toy/Ch5/codegen.toy --- a/mlir/test/Examples/Toy/Ch5/codegen.toy +++ b/mlir/test/Examples/Toy/Ch5/codegen.toy @@ -15,17 +15,17 @@ # CHECK-LABEL: func @multiply_transpose( # CHECK-SAME: [[VAL_0:%.*]]: tensor<*xf64>, [[VAL_1:%.*]]: tensor<*xf64>) -> tensor<*xf64> -# CHECK: [[VAL_2:%.*]] = "toy.transpose"([[VAL_0]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_3:%.*]] = "toy.transpose"([[VAL_1]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_4:%.*]] = "toy.mul"([[VAL_2]], [[VAL_3]]) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.return"([[VAL_4]]) : (tensor<*xf64>) -> () +# CHECK: [[VAL_2:%.*]] = toy.transpose([[VAL_0]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_3:%.*]] = toy.transpose([[VAL_1]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_4:%.*]] = toy.mul [[VAL_2]], [[VAL_3]] : tensor<*xf64> +# CHECK-NEXT: toy.return [[VAL_4]] : tensor<*xf64> # CHECK-LABEL: func @main() -# CHECK-NEXT: [[VAL_5:%.*]] = "toy.constant"() {value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_6:%.*]] = "toy.reshape"([[VAL_5]]) : (tensor<2x3xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_7:%.*]] = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> -# CHECK-NEXT: [[VAL_8:%.*]] = "toy.reshape"([[VAL_7]]) : (tensor<6xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_9:%.*]] = "toy.generic_call"([[VAL_6]], [[VAL_8]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_10:%.*]] = "toy.generic_call"([[VAL_8]], [[VAL_6]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.print"([[VAL_10]]) : (tensor<*xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: [[VAL_5:%.*]] = toy.constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# CHECK-NEXT: [[VAL_6:%.*]] = toy.reshape([[VAL_5]] : tensor<2x3xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_7:%.*]] = toy.constant dense<[1.000000e+00, 2.000000e+00, 
3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# CHECK-NEXT: [[VAL_8:%.*]] = toy.reshape([[VAL_7]] : tensor<6xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_9:%.*]] = toy.generic_call @multiply_transpose([[VAL_6]], [[VAL_8]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: [[VAL_10:%.*]] = toy.generic_call @multiply_transpose([[VAL_8]], [[VAL_6]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: toy.print [[VAL_10]] : tensor<*xf64> +# CHECK-NEXT: toy.return diff --git a/mlir/test/Examples/Toy/Ch5/scalar.toy b/mlir/test/Examples/Toy/Ch5/scalar.toy --- a/mlir/test/Examples/Toy/Ch5/scalar.toy +++ b/mlir/test/Examples/Toy/Ch5/scalar.toy @@ -6,9 +6,9 @@ } # CHECK-LABEL: func @main() { -# CHECK-NEXT: %0 = "toy.constant"() {value = dense<5.500000e+00> : tensor<f64>} : () -> tensor<f64> -# CHECK-NEXT: %1 = "toy.reshape"(%0) : (tensor<f64>) -> tensor<2x2xf64> -# CHECK-NEXT: "toy.print"(%1) : (tensor<2x2xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: %0 = toy.constant dense<5.500000e+00> : tensor<f64> +# CHECK-NEXT: %1 = toy.reshape(%0 : tensor<f64>) to tensor<2x2xf64> +# CHECK-NEXT: toy.print %1 : tensor<2x2xf64> +# CHECK-NEXT: toy.return # CHECK-NEXT: } diff --git a/mlir/test/Examples/Toy/Ch5/shape_inference.mlir b/mlir/test/Examples/Toy/Ch5/shape_inference.mlir --- a/mlir/test/Examples/Toy/Ch5/shape_inference.mlir +++ b/mlir/test/Examples/Toy/Ch5/shape_inference.mlir @@ -4,28 +4,28 @@ func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> attributes { sym_visibility = "private" } { - %0 = "toy.transpose"(%arg0) : (tensor<*xf64>) -> tensor<*xf64> - %1 = "toy.transpose"(%arg1) : (tensor<*xf64>) -> tensor<*xf64> - %2 = "toy.mul"(%0, %1) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> - "toy.return"(%2) : (tensor<*xf64>) -> () + %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> + %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> + %2 = toy.mul %0, %1 : tensor<*xf64> + toy.return %2 : tensor<*xf64> } func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %1 = "toy.reshape"(%0) : (tensor<2x3xf64>) -> tensor<2x3xf64> - %2 = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> - %3 = "toy.reshape"(%2) : (tensor<6xf64>) -> tensor<2x3xf64> - %4 = "toy.generic_call"(%1, %3) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> - %5 = "toy.generic_call"(%3, %1) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> - "toy.print"(%5) : (tensor<*xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> + %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> + %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> + %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + toy.print %5 : tensor<*xf64> + toy.return } // CHECK-NOT: func @multiply_transpose // CHECK-NOT: tensor<*xf64> // CHECK-LABEL: func @main()
-// CHECK: [[VAL_0:%.*]] = "toy.constant"() {value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> -// CHECK: [[VAL_1:%.*]] = "toy.transpose"([[VAL_0]]) : (tensor<2x3xf64>) -> tensor<3x2xf64> -// CHECK: [[VAL_2:%.*]] = "toy.mul"([[VAL_1]], [[VAL_1]]) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> -// CHECK: "toy.print"([[VAL_2]]) : (tensor<3x2xf64>) -> () -// CHECK: "toy.return"() : () -> () +// CHECK: [[VAL_0:%.*]] = toy.constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +// CHECK: [[VAL_1:%.*]] = toy.transpose([[VAL_0]] : tensor<2x3xf64>) to tensor<3x2xf64> +// CHECK: [[VAL_2:%.*]] = toy.mul [[VAL_1]], [[VAL_1]] : tensor<3x2xf64> +// CHECK: toy.print [[VAL_2]] : tensor<3x2xf64> +// CHECK: toy.return diff --git a/mlir/test/Examples/Toy/Ch6/affine-lowering.mlir b/mlir/test/Examples/Toy/Ch6/affine-lowering.mlir --- a/mlir/test/Examples/Toy/Ch6/affine-lowering.mlir +++ b/mlir/test/Examples/Toy/Ch6/affine-lowering.mlir @@ -2,11 +2,11 @@ // RUN: toyc-ch6 %s -emit=mlir-affine -opt 2>&1 | FileCheck %s --check-prefix=OPT func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %2 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> - %3 = "toy.mul"(%2, %2) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> - "toy.print"(%3) : (tensor<3x2xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> + %3 = toy.mul %2, %2 : tensor<3x2xf64> + toy.print %3 : tensor<3x2xf64> + toy.return } // CHECK-LABEL: func @main() @@ -35,7 +35,7 @@ // CHECK: [[VAL_15:%.*]] = affine.load [[VAL_7]]{{\[}}[[VAL_12]], [[VAL_13]]] : memref<3x2xf64> // CHECK: [[VAL_16:%.*]] = mulf [[VAL_14]], [[VAL_15]] : f64 // CHECK: affine.store [[VAL_16]], [[VAL_6]]{{\[}}[[VAL_12]], [[VAL_13]]] : memref<3x2xf64> -// CHECK: "toy.print"([[VAL_6]]) : (memref<3x2xf64>) -> () +// CHECK: toy.print [[VAL_6]] : memref<3x2xf64> // CHECK: dealloc [[VAL_8]] : memref<2x3xf64> // CHECK: dealloc [[VAL_7]] : memref<3x2xf64> // CHECK: dealloc [[VAL_6]] : memref<3x2xf64> @@ -60,6 +60,6 @@ // OPT: [[VAL_10:%.*]] = affine.load [[VAL_7]]{{\[}}[[VAL_9]], [[VAL_8]]] : memref<2x3xf64> // OPT: [[VAL_11:%.*]] = mulf [[VAL_10]], [[VAL_10]] : f64 // OPT: affine.store [[VAL_11]], [[VAL_6]]{{\[}}[[VAL_8]], [[VAL_9]]] : memref<3x2xf64> -// OPT: "toy.print"([[VAL_6]]) : (memref<3x2xf64>) -> () +// OPT: toy.print [[VAL_6]] : memref<3x2xf64> // OPT: dealloc [[VAL_7]] : memref<2x3xf64> // OPT: dealloc [[VAL_6]] : memref<3x2xf64> diff --git a/mlir/test/Examples/Toy/Ch6/codegen.toy b/mlir/test/Examples/Toy/Ch6/codegen.toy --- a/mlir/test/Examples/Toy/Ch6/codegen.toy +++ b/mlir/test/Examples/Toy/Ch6/codegen.toy @@ -15,17 +15,17 @@ # CHECK-LABEL: func @multiply_transpose( # CHECK-SAME: [[VAL_0:%.*]]: tensor<*xf64>, [[VAL_1:%.*]]: tensor<*xf64>) -> tensor<*xf64> -# CHECK: [[VAL_2:%.*]] = "toy.transpose"([[VAL_0]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_3:%.*]] = "toy.transpose"([[VAL_1]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_4:%.*]] = "toy.mul"([[VAL_2]], [[VAL_3]]) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: 
"toy.return"([[VAL_4]]) : (tensor<*xf64>) -> () +# CHECK: [[VAL_2:%.*]] = toy.transpose([[VAL_0]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_3:%.*]] = toy.transpose([[VAL_1]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_4:%.*]] = toy.mul [[VAL_2]], [[VAL_3]] : tensor<*xf64> +# CHECK-NEXT: toy.return [[VAL_4]] : tensor<*xf64> # CHECK-LABEL: func @main() -# CHECK-NEXT: [[VAL_5:%.*]] = "toy.constant"() {value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_6:%.*]] = "toy.reshape"([[VAL_5]]) : (tensor<2x3xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_7:%.*]] = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> -# CHECK-NEXT: [[VAL_8:%.*]] = "toy.reshape"([[VAL_7]]) : (tensor<6xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_9:%.*]] = "toy.generic_call"([[VAL_6]], [[VAL_8]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_10:%.*]] = "toy.generic_call"([[VAL_8]], [[VAL_6]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.print"([[VAL_10]]) : (tensor<*xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: [[VAL_5:%.*]] = toy.constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# CHECK-NEXT: [[VAL_6:%.*]] = toy.reshape([[VAL_5]] : tensor<2x3xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_7:%.*]] = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# CHECK-NEXT: [[VAL_8:%.*]] = toy.reshape([[VAL_7]] : tensor<6xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_9:%.*]] = toy.generic_call @multiply_transpose([[VAL_6]], [[VAL_8]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: [[VAL_10:%.*]] = toy.generic_call @multiply_transpose([[VAL_8]], [[VAL_6]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: toy.print [[VAL_10]] : tensor<*xf64> +# CHECK-NEXT: toy.return diff --git a/mlir/test/Examples/Toy/Ch6/llvm-lowering.mlir b/mlir/test/Examples/Toy/Ch6/llvm-lowering.mlir --- a/mlir/test/Examples/Toy/Ch6/llvm-lowering.mlir +++ b/mlir/test/Examples/Toy/Ch6/llvm-lowering.mlir @@ -1,11 +1,11 @@ // RUN: toyc-ch6 %s -emit=llvm -opt func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %2 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> - %3 = "toy.mul"(%2, %2) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> - "toy.print"(%3) : (tensor<3x2xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> + %3 = toy.mul %2, %2 : tensor<3x2xf64> + toy.print %3 : tensor<3x2xf64> + toy.return } // CHECK-LABEL: define void @main() diff --git a/mlir/test/Examples/Toy/Ch6/scalar.toy b/mlir/test/Examples/Toy/Ch6/scalar.toy --- a/mlir/test/Examples/Toy/Ch6/scalar.toy +++ b/mlir/test/Examples/Toy/Ch6/scalar.toy @@ -6,9 +6,9 @@ } # CHECK-LABEL: func @main() { -# CHECK-NEXT: %0 = "toy.constant"() {value = dense<5.500000e+00> : tensor} : () -> tensor -# 
CHECK-NEXT: %1 = "toy.reshape"(%0) : (tensor) -> tensor<2x2xf64> -# CHECK-NEXT: "toy.print"(%1) : (tensor<2x2xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: %0 = toy.constant dense<5.500000e+00> : tensor +# CHECK-NEXT: %1 = toy.reshape(%0 : tensor) to tensor<2x2xf64> +# CHECK-NEXT: toy.print %1 : tensor<2x2xf64> +# CHECK-NEXT: toy.return # CHECK-NEXT: } diff --git a/mlir/test/Examples/Toy/Ch6/shape_inference.mlir b/mlir/test/Examples/Toy/Ch6/shape_inference.mlir --- a/mlir/test/Examples/Toy/Ch6/shape_inference.mlir +++ b/mlir/test/Examples/Toy/Ch6/shape_inference.mlir @@ -4,28 +4,28 @@ func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> attributes { sym_visibility = "private" } { - %0 = "toy.transpose"(%arg0) : (tensor<*xf64>) -> tensor<*xf64> - %1 = "toy.transpose"(%arg1) : (tensor<*xf64>) -> tensor<*xf64> - %2 = "toy.mul"(%0, %1) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> - "toy.return"(%2) : (tensor<*xf64>) -> () + %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> + %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> + %2 = toy.mul %0, %1 : tensor<*xf64> + toy.return %2 : tensor<*xf64> } func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %1 = "toy.reshape"(%0) : (tensor<2x3xf64>) -> tensor<2x3xf64> - %2 = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> - %3 = "toy.reshape"(%2) : (tensor<6xf64>) -> tensor<2x3xf64> - %4 = "toy.generic_call"(%1, %3) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> - %5 = "toy.generic_call"(%3, %1) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> - "toy.print"(%5) : (tensor<*xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> + %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> + %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> + %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + toy.print %5 : tensor<*xf64> + toy.return } // CHECK-NOT: func @multiply_transpose // CHECK-NOT: tensor<*xf64> // CHECK-LABEL: func @main() -// CHECK: [[VAL_0:%.*]] = "toy.constant"() {value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> -// CHECK: [[VAL_1:%.*]] = "toy.transpose"([[VAL_0]]) : (tensor<2x3xf64>) -> tensor<3x2xf64> -// CHECK: [[VAL_2:%.*]] = "toy.mul"([[VAL_1]], [[VAL_1]]) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> -// CHECK: "toy.print"([[VAL_2]]) : (tensor<3x2xf64>) -> () -// CHECK: "toy.return"() : () -> () +// CHECK: [[VAL_0:%.*]] = toy.constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +// CHECK: [[VAL_1:%.*]] = toy.transpose([[VAL_0]] : tensor<2x3xf64>) to tensor<3x2xf64> +// CHECK: [[VAL_2:%.*]] = toy.mul [[VAL_1]], [[VAL_1]] : tensor<3x2xf64> +// CHECK: toy.print 
[[VAL_2]] : tensor<3x2xf64> +// CHECK: toy.return diff --git a/mlir/test/Examples/Toy/Ch7/affine-lowering.mlir b/mlir/test/Examples/Toy/Ch7/affine-lowering.mlir --- a/mlir/test/Examples/Toy/Ch7/affine-lowering.mlir +++ b/mlir/test/Examples/Toy/Ch7/affine-lowering.mlir @@ -2,11 +2,11 @@ // RUN: toyc-ch7 %s -emit=mlir-affine -opt 2>&1 | FileCheck %s --check-prefix=OPT func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %2 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> - %3 = "toy.mul"(%2, %2) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> - "toy.print"(%3) : (tensor<3x2xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> + %3 = toy.mul %2, %2 : tensor<3x2xf64> + toy.print %3 : tensor<3x2xf64> + toy.return } // CHECK-LABEL: func @main() @@ -35,7 +35,7 @@ // CHECK: [[VAL_15:%.*]] = affine.load [[VAL_7]]{{\[}}[[VAL_12]], [[VAL_13]]] : memref<3x2xf64> // CHECK: [[VAL_16:%.*]] = mulf [[VAL_14]], [[VAL_15]] : f64 // CHECK: affine.store [[VAL_16]], [[VAL_6]]{{\[}}[[VAL_12]], [[VAL_13]]] : memref<3x2xf64> -// CHECK: "toy.print"([[VAL_6]]) : (memref<3x2xf64>) -> () +// CHECK: toy.print [[VAL_6]] : memref<3x2xf64> // CHECK: dealloc [[VAL_8]] : memref<2x3xf64> // CHECK: dealloc [[VAL_7]] : memref<3x2xf64> // CHECK: dealloc [[VAL_6]] : memref<3x2xf64> @@ -60,6 +60,6 @@ // OPT: [[VAL_10:%.*]] = affine.load [[VAL_7]]{{\[}}[[VAL_9]], [[VAL_8]]] : memref<2x3xf64> // OPT: [[VAL_11:%.*]] = mulf [[VAL_10]], [[VAL_10]] : f64 // OPT: affine.store [[VAL_11]], [[VAL_6]]{{\[}}[[VAL_8]], [[VAL_9]]] : memref<3x2xf64> -// OPT: "toy.print"([[VAL_6]]) : (memref<3x2xf64>) -> () +// OPT: toy.print [[VAL_6]] : memref<3x2xf64> // OPT: dealloc [[VAL_7]] : memref<2x3xf64> // OPT: dealloc [[VAL_6]] : memref<3x2xf64> diff --git a/mlir/test/Examples/Toy/Ch7/codegen.toy b/mlir/test/Examples/Toy/Ch7/codegen.toy --- a/mlir/test/Examples/Toy/Ch7/codegen.toy +++ b/mlir/test/Examples/Toy/Ch7/codegen.toy @@ -15,17 +15,17 @@ # CHECK-LABEL: func @multiply_transpose( # CHECK-SAME: [[VAL_0:%.*]]: tensor<*xf64>, [[VAL_1:%.*]]: tensor<*xf64>) -> tensor<*xf64> -# CHECK: [[VAL_2:%.*]] = "toy.transpose"([[VAL_0]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_3:%.*]] = "toy.transpose"([[VAL_1]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_4:%.*]] = "toy.mul"([[VAL_2]], [[VAL_3]]) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.return"([[VAL_4]]) : (tensor<*xf64>) -> () +# CHECK: [[VAL_2:%.*]] = toy.transpose([[VAL_0]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_3:%.*]] = toy.transpose([[VAL_1]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_4:%.*]] = toy.mul [[VAL_2]], [[VAL_3]] : tensor<*xf64> +# CHECK-NEXT: toy.return [[VAL_4]] : tensor<*xf64> # CHECK-LABEL: func @main() -# CHECK-NEXT: [[VAL_5:%.*]] = "toy.constant"() {value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_6:%.*]] = "toy.reshape"([[VAL_5]]) : (tensor<2x3xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_7:%.*]] = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> 
tensor<6xf64> -# CHECK-NEXT: [[VAL_8:%.*]] = "toy.reshape"([[VAL_7]]) : (tensor<6xf64>) -> tensor<2x3xf64> -# CHECK-NEXT: [[VAL_9:%.*]] = "toy.generic_call"([[VAL_6]], [[VAL_8]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_10:%.*]] = "toy.generic_call"([[VAL_8]], [[VAL_6]]) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.print"([[VAL_10]]) : (tensor<*xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: [[VAL_5:%.*]] = toy.constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# CHECK-NEXT: [[VAL_6:%.*]] = toy.reshape([[VAL_5]] : tensor<2x3xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_7:%.*]] = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# CHECK-NEXT: [[VAL_8:%.*]] = toy.reshape([[VAL_7]] : tensor<6xf64>) to tensor<2x3xf64> +# CHECK-NEXT: [[VAL_9:%.*]] = toy.generic_call @multiply_transpose([[VAL_6]], [[VAL_8]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: [[VAL_10:%.*]] = toy.generic_call @multiply_transpose([[VAL_8]], [[VAL_6]]) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# CHECK-NEXT: toy.print [[VAL_10]] : tensor<*xf64> +# CHECK-NEXT: toy.return diff --git a/mlir/test/Examples/Toy/Ch7/llvm-lowering.mlir b/mlir/test/Examples/Toy/Ch7/llvm-lowering.mlir --- a/mlir/test/Examples/Toy/Ch7/llvm-lowering.mlir +++ b/mlir/test/Examples/Toy/Ch7/llvm-lowering.mlir @@ -1,11 +1,11 @@ // RUN: toyc-ch7 %s -emit=llvm -opt func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %2 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> - %3 = "toy.mul"(%2, %2) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> - "toy.print"(%3) : (tensor<3x2xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> + %3 = toy.mul %2, %2 : tensor<3x2xf64> + toy.print %3 : tensor<3x2xf64> + toy.return } // CHECK-LABEL: define void @main() diff --git a/mlir/test/Examples/Toy/Ch7/scalar.toy b/mlir/test/Examples/Toy/Ch7/scalar.toy --- a/mlir/test/Examples/Toy/Ch7/scalar.toy +++ b/mlir/test/Examples/Toy/Ch7/scalar.toy @@ -6,9 +6,9 @@ } # CHECK-LABEL: func @main() { -# CHECK-NEXT: %0 = "toy.constant"() {value = dense<5.500000e+00> : tensor<f64>} : () -> tensor<f64> -# CHECK-NEXT: %1 = "toy.reshape"(%0) : (tensor<f64>) -> tensor<2x2xf64> -# CHECK-NEXT: "toy.print"(%1) : (tensor<2x2xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: %0 = toy.constant dense<5.500000e+00> : tensor<f64> +# CHECK-NEXT: %1 = toy.reshape(%0 : tensor<f64>) to tensor<2x2xf64> +# CHECK-NEXT: toy.print %1 : tensor<2x2xf64> +# CHECK-NEXT: toy.return # CHECK-NEXT: } diff --git a/mlir/test/Examples/Toy/Ch7/shape_inference.mlir b/mlir/test/Examples/Toy/Ch7/shape_inference.mlir --- a/mlir/test/Examples/Toy/Ch7/shape_inference.mlir +++ b/mlir/test/Examples/Toy/Ch7/shape_inference.mlir @@ -4,28 +4,28 @@ func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> attributes { sym_visibility = "private" } { - %0 = "toy.transpose"(%arg0) : (tensor<*xf64>) -> tensor<*xf64> - %1 = "toy.transpose"(%arg1) :
(tensor<*xf64>) -> tensor<*xf64> - %2 = "toy.mul"(%0, %1) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> - "toy.return"(%2) : (tensor<*xf64>) -> () + %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> + %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> + %2 = toy.mul %0, %1 : tensor<*xf64> + toy.return %2 : tensor<*xf64> } func @main() { - %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> - %1 = "toy.reshape"(%0) : (tensor<2x3xf64>) -> tensor<2x3xf64> - %2 = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> - %3 = "toy.reshape"(%2) : (tensor<6xf64>) -> tensor<2x3xf64> - %4 = "toy.generic_call"(%1, %3) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> - %5 = "toy.generic_call"(%3, %1) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> - "toy.print"(%5) : (tensor<*xf64>) -> () - "toy.return"() : () -> () + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> + %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> + %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> + %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + toy.print %5 : tensor<*xf64> + toy.return } // CHECK-NOT: func @multiply_transpose // CHECK-NOT: tensor<*xf64> // CHECK-LABEL: func @main() -// CHECK: [[VAL_0:%.*]] = "toy.constant"() {value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> -// CHECK: [[VAL_1:%.*]] = "toy.transpose"([[VAL_0]]) : (tensor<2x3xf64>) -> tensor<3x2xf64> -// CHECK: [[VAL_2:%.*]] = "toy.mul"([[VAL_1]], [[VAL_1]]) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> -// CHECK: toy.print [[VAL_2]] : tensor<3x2xf64> -// CHECK: "toy.return"() : () -> () +// CHECK: [[VAL_0:%.*]] = toy.constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +// CHECK: [[VAL_1:%.*]] = toy.transpose([[VAL_0]] : tensor<2x3xf64>) to tensor<3x2xf64> +// CHECK: [[VAL_2:%.*]] = toy.mul [[VAL_1]], [[VAL_1]] : tensor<3x2xf64> +// CHECK: toy.print [[VAL_2]] : tensor<3x2xf64> +// CHECK: toy.return diff --git a/mlir/test/Examples/Toy/Ch7/struct-codegen.toy b/mlir/test/Examples/Toy/Ch7/struct-codegen.toy --- a/mlir/test/Examples/Toy/Ch7/struct-codegen.toy +++ b/mlir/test/Examples/Toy/Ch7/struct-codegen.toy @@ -24,22 +24,22 @@ # CHECK-LABEL: func @multiply_transpose( # CHECK-SAME: [[VAL_0:%.*]]: !toy.struct<tensor<*xf64>, tensor<*xf64>>) -> tensor<*xf64> # CHECK-SAME: attributes {sym_visibility = "private"} -# CHECK-NEXT: [[VAL_1:%.*]] = "toy.struct_access"([[VAL_0]]) {index = 0 : i64} : (!toy.struct<tensor<*xf64>, tensor<*xf64>>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_2:%.*]] = "toy.transpose"([[VAL_1]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_3:%.*]] = "toy.struct_access"([[VAL_0]]) {index = 1 : i64} : (!toy.struct<tensor<*xf64>, tensor<*xf64>>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_4:%.*]] =
"toy.transpose"([[VAL_3]]) : (tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: [[VAL_5:%.*]] = "toy.mul"([[VAL_2]], [[VAL_4]]) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> -# CHECK-NEXT: "toy.return"([[VAL_5]]) : (tensor<*xf64>) -> () +# CHECK-NEXT: [[VAL_1:%.*]] = toy.struct_access [[VAL_0]][0] : !toy.struct, tensor<*xf64>> -> tensor<*xf64> +# CHECK-NEXT: [[VAL_2:%.*]] = toy.transpose([[VAL_1]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_3:%.*]] = toy.struct_access [[VAL_0]][1] : !toy.struct, tensor<*xf64>> -> tensor<*xf64> +# CHECK-NEXT: [[VAL_4:%.*]] = toy.transpose([[VAL_3]] : tensor<*xf64>) to tensor<*xf64> +# CHECK-NEXT: [[VAL_5:%.*]] = toy.mul [[VAL_2]], [[VAL_4]] : tensor<*xf64> +# CHECK-NEXT: toy.return [[VAL_5]] : tensor<*xf64> # CHECK-LABEL: func @main() -# CHECK-NEXT: [[VAL_6:%.*]] = "toy.struct_constant"() {value = [dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>, dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>]} : () -> !toy.struct, tensor<*xf64>> -# CHECK-NEXT: [[VAL_7:%.*]] = "toy.generic_call"([[VAL_6]]) {callee = @multiply_transpose} : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> -# CHECK-NEXT: "toy.print"([[VAL_7]]) : (tensor<*xf64>) -> () -# CHECK-NEXT: "toy.return"() : () -> () +# CHECK-NEXT: [[VAL_6:%.*]] = toy.struct_constant [dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>, dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>] : !toy.struct, tensor<*xf64>> +# CHECK-NEXT: [[VAL_7:%.*]] = toy.generic_call @multiply_transpose([[VAL_6]]) : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> +# CHECK-NEXT: toy.print [[VAL_7]] : tensor<*xf64> +# CHECK-NEXT: toy.return # OPT-LABEL: func @main() -# OPT-NEXT: [[VAL_0:%.*]] = "toy.constant"() {value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> -# OPT-NEXT: [[VAL_1:%.*]] = "toy.transpose"([[VAL_0]]) : (tensor<2x3xf64>) -> tensor<3x2xf64> -# OPT-NEXT: [[VAL_2:%.*]] = "toy.mul"([[VAL_1]], [[VAL_1]]) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> -# OPT-NEXT: "toy.print"([[VAL_2]]) : (tensor<3x2xf64>) -> () -# OPT-NEXT: "toy.return"() : () -> () +# OPT-NEXT: [[VAL_0:%.*]] = toy.constant dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# OPT-NEXT: [[VAL_1:%.*]] = toy.transpose([[VAL_0]] : tensor<2x3xf64>) to tensor<3x2xf64> +# OPT-NEXT: [[VAL_2:%.*]] = toy.mul [[VAL_1]], [[VAL_1]] : tensor<3x2xf64> +# OPT-NEXT: toy.print [[VAL_2]] : tensor<3x2xf64> +# OPT-NEXT: toy.return diff --git a/mlir/test/Examples/Toy/Ch7/struct-opt.mlir b/mlir/test/Examples/Toy/Ch7/struct-opt.mlir --- a/mlir/test/Examples/Toy/Ch7/struct-opt.mlir +++ b/mlir/test/Examples/Toy/Ch7/struct-opt.mlir @@ -1,16 +1,15 @@ // RUN: toyc-ch7 %s -emit=mlir -opt 2>&1 | FileCheck %s func @main() { - %0 = "toy.struct_constant"() { - value = [[dense<4.000000e+00> : tensor<2x2xf64>], dense<4.000000e+00> : tensor<2x2xf64>] - } : () -> !toy.struct>, tensor<*xf64>> - %1 = "toy.struct_access"(%0) {index = 0 : i64} : (!toy.struct>, tensor<*xf64>>) -> !toy.struct> - %2 = "toy.struct_access"(%1) {index = 0 : i64} : (!toy.struct>) -> tensor<*xf64> - "toy.print"(%2) : (tensor<*xf64>) -> () - "toy.return"() : () -> () 
+ %0 = toy.struct_constant [ + [dense<4.000000e+00> : tensor<2x2xf64>], dense<4.000000e+00> : tensor<2x2xf64> + ] : !toy.struct<!toy.struct<tensor<*xf64>>, tensor<*xf64>> + %1 = toy.struct_access %0[0] : !toy.struct<!toy.struct<tensor<*xf64>>, tensor<*xf64>> -> !toy.struct<tensor<*xf64>> + %2 = toy.struct_access %1[0] : !toy.struct<tensor<*xf64>> -> tensor<*xf64> + toy.print %2 : tensor<*xf64> + toy.return } // CHECK-LABEL: func @main -// CHECK-NEXT: %[[CST:.*]] = "toy.constant" -// CHECK-SAME: dense<4.0 -// CHECK-NEXT: "toy.print"(%[[CST]]) +// CHECK-NEXT: %[[CST:.*]] = toy.constant dense<4.0 +// CHECK-NEXT: toy.print %[[CST]] diff --git a/mlir/test/IR/invalid-ops.mlir b/mlir/test/IR/invalid-ops.mlir --- a/mlir/test/IR/invalid-ops.mlir +++ b/mlir/test/IR/invalid-ops.mlir @@ -226,7 +226,7 @@ // Integer comparisons are not recognized for float types. func @func_with_ops(f32, f32) { ^bb0(%a : f32, %b : f32): - %r = cmpi "eq", %a, %b : f32 // expected-error {{operand #0 must be integer-like}} + %r = cmpi "eq", %a, %b : f32 // expected-error {{'lhs' must be integer-like, but got 'f32'}} } // ----- @@ -298,13 +298,13 @@ // ----- func @invalid_select_shape(%cond : i1, %idx : () -> ()) { - // expected-error@+1 {{expected type with valid i1 shape}} + // expected-error@+1 {{'result' must be integer-like or floating-point-like, but got '() -> ()'}} %sel = select %cond, %idx, %idx : () -> () // ----- func @invalid_cmp_shape(%idx : () -> ()) { - // expected-error@+1 {{expected type with valid i1 shape}} + // expected-error@+1 {{'lhs' must be integer-like, but got '() -> ()'}} %cmp = cmpi "eq", %idx, %idx : () -> () // ----- @@ -340,7 +340,7 @@ // ----- func @invalid_cmp_attr(%idx : i32) { - // expected-error@+1 {{expected string comparison predicate attribute}} + // expected-error@+1 {{invalid kind of attribute specified}} %cmp = cmpi i1, %idx, %idx : i32 // ----- diff --git a/mlir/test/IR/invalid.mlir b/mlir/test/IR/invalid.mlir --- a/mlir/test/IR/invalid.mlir +++ b/mlir/test/IR/invalid.mlir @@ -402,7 +402,6 @@ ^bb0: %a = "foo"() : () -> i32 // expected-note {{prior use here}} cond_br %a, ^bb0, ^bb0 // expected-error {{use of value '%a' expects different type than prior uses: 'i1' vs 'i32'}} -// expected-error@-1 {{expected condition type was boolean (i1)}} } // ----- diff --git a/mlir/test/lib/TestDialect/TestOps.td b/mlir/test/lib/TestDialect/TestOps.td --- a/mlir/test/lib/TestDialect/TestOps.td +++ b/mlir/test/lib/TestDialect/TestOps.td @@ -431,7 +431,7 @@ [(IsNotScalar $attr)]>; def TestBranchOp : TEST_Op<"br", [Terminator]> { - let arguments = (ins Variadic<AnyType>:$operands); + let successors = (successor AnySuccessor:$target); } def AttrSizedOperandOp : TEST_Op<"attr_sized_operands", @@ -1089,6 +1089,12 @@ let assemblyFormat = "$attr attr-dict"; } +// Test that we elide attributes that are within the syntax. +def FormatAttrDictWithKeywordOp : TEST_Op<"format_attr_dict_w_keyword"> { + let arguments = (ins I64Attr:$attr); + let assemblyFormat = "attr-dict-with-keyword"; +} + // Test that we don't need to provide types in the format if they are buildable.
def FormatBuildableTypeOp : TEST_Op<"format_buildable_type_op"> { let arguments = (ins I64:$buildable); @@ -1133,4 +1139,9 @@ $buildable `,` $operand `:` type($buildable) `,` type($operand) attr-dict }]>; +def FormatSuccessorAOp : TEST_Op<"format_successor_a_op", [Terminator]> { + let successors = (successor VariadicSuccessor<AnySuccessor>:$targets); + let assemblyFormat = "$targets attr-dict"; +} + #endif // TEST_OPS diff --git a/mlir/test/mlir-tblgen/op-format-spec.td b/mlir/test/mlir-tblgen/op-format-spec.td --- a/mlir/test/mlir-tblgen/op-format-spec.td +++ b/mlir/test/mlir-tblgen/op-format-spec.td @@ -26,14 +26,21 @@ def DirectiveAttrDictInvalidB : TestFormat_Op<"attrdict_invalid_b", [{ attr-dict attr-dict }]>; -// CHECK: error: 'attr-dict' directive can only be used as a top-level directive +// CHECK: error: 'attr-dict' directive has already been seen def DirectiveAttrDictInvalidC : TestFormat_Op<"attrdict_invalid_c", [{ + attr-dict attr-dict-with-keyword +}]>; +// CHECK: error: 'attr-dict' directive can only be used as a top-level directive +def DirectiveAttrDictInvalidD : TestFormat_Op<"attrdict_invalid_d", [{ type(attr-dict) }]>; // CHECK-NOT: error -def DirectiveAttrDictValid : TestFormat_Op<"attrdict_valid", [{ +def DirectiveAttrDictValidA : TestFormat_Op<"attrdict_valid_a", [{ attr-dict }]>; +def DirectiveAttrDictValidB : TestFormat_Op<"attrdict_valid_b", [{ + attr-dict-with-keyword +}]>; //===----------------------------------------------------------------------===// // functional-type @@ -46,7 +53,7 @@ def DirectiveFunctionalTypeInvalidB : TestFormat_Op<"functype_invalid_b", [{ functional-type }]>; -// CHECK: error: expected directive, literal, or variable +// CHECK: error: expected directive, literal, variable, or optional group def DirectiveFunctionalTypeInvalidC : TestFormat_Op<"functype_invalid_c", [{ functional-type( }]>; @@ -54,7 +61,7 @@ def DirectiveFunctionalTypeInvalidD : TestFormat_Op<"functype_invalid_d", [{ functional-type(operands }]>; -// CHECK: error: expected directive, literal, or variable +// CHECK: error: expected directive, literal, variable, or optional group def DirectiveFunctionalTypeInvalidE : TestFormat_Op<"functype_invalid_e", [{ functional-type(operands, }]>; @@ -87,10 +94,18 @@ // results // CHECK: error: 'results' directive can not be used as a top-level directive -def DirectiveResultsInvalidA : TestFormat_Op<"operands_invalid_a", [{ +def DirectiveResultsInvalidA : TestFormat_Op<"results_invalid_a", [{ results }]>; +//===----------------------------------------------------------------------===// +// successors + +// CHECK: error: 'successors' is only valid as a top-level directive +def DirectiveSuccessorsInvalidA : TestFormat_Op<"successors_invalid_a", [{ + type(successors) +}]>; + //===----------------------------------------------------------------------===// // type @@ -98,7 +113,7 @@ def DirectiveTypeInvalidA : TestFormat_Op<"type_invalid_a", [{ type }]>; -// CHECK: error: expected directive, literal, or variable +// CHECK: error: expected directive, literal, variable, or optional group def DirectiveTypeInvalidB : TestFormat_Op<"type_invalid_b", [{ type( }]>; @@ -165,7 +180,7 @@ `1` }]>; // CHECK: error: unexpected end of file in literal -// CHECK: error: expected directive, literal, or variable +// CHECK: error: expected directive, literal, variable, or optional group def LiteralInvalidB : TestFormat_Op<"literal_invalid_b", [{ ` }]>; @@ -175,11 +190,60 @@ attr-dict }]>; +//===----------------------------------------------------------------------===// +//
Optional Groups
+//===----------------------------------------------------------------------===//
+
+// CHECK: error: optional groups can only be used as top-level elements
+def OptionalInvalidA : TestFormat_Op<"optional_invalid_a", [{
+  type(($attr^)?) attr-dict
+}]>, Arguments<(ins OptionalAttr<I64Attr>:$attr)>;
+// CHECK: error: expected directive, literal, variable, or optional group
+def OptionalInvalidB : TestFormat_Op<"optional_invalid_b", [{
+  () attr-dict
+}]>, Arguments<(ins OptionalAttr<I64Attr>:$attr)>;
+// CHECK: error: optional group specified no anchor element
+def OptionalInvalidC : TestFormat_Op<"optional_invalid_c", [{
+  ($attr)? attr-dict
+}]>, Arguments<(ins OptionalAttr<I64Attr>:$attr)>;
+// CHECK: error: first element of an operand group must be a literal or operand
+def OptionalInvalidD : TestFormat_Op<"optional_invalid_d", [{
+  ($attr^)? attr-dict
+}]>, Arguments<(ins OptionalAttr<I64Attr>:$attr)>;
+// CHECK: error: type directive can only refer to variables within the optional group
+def OptionalInvalidE : TestFormat_Op<"optional_invalid_e", [{
+  (`,` $attr^ type(operands))? attr-dict
+}]>, Arguments<(ins OptionalAttr<I64Attr>:$attr)>;
+// CHECK: error: only one element can be marked as the anchor of an optional group
+def OptionalInvalidF : TestFormat_Op<"optional_invalid_f", [{
+  ($attr^ $attr2^) attr-dict
+}]>, Arguments<(ins OptionalAttr<I64Attr>:$attr, OptionalAttr<I64Attr>:$attr2)>;
+// CHECK: error: only optional attributes can be used to anchor an optional group
+def OptionalInvalidG : TestFormat_Op<"optional_invalid_g", [{
+  ($attr^) attr-dict
+}]>, Arguments<(ins I64Attr:$attr)>;
+// CHECK: error: only variadic operands can be used within an optional group
+def OptionalInvalidH : TestFormat_Op<"optional_invalid_h", [{
+  ($arg^) attr-dict
+}]>, Arguments<(ins I64:$arg)>;
+// CHECK: error: only variables can be used to anchor an optional group
+def OptionalInvalidI : TestFormat_Op<"optional_invalid_i", [{
+  ($arg type($arg)^) attr-dict
+}]>, Arguments<(ins Variadic<I64>:$arg)>;
+// CHECK: error: only literals, types, and variables can be used within an optional group
+def OptionalInvalidJ : TestFormat_Op<"optional_invalid_j", [{
+  (attr-dict)
+}]>;
+// CHECK: error: expected '?'
after optional group +def OptionalInvalidK : TestFormat_Op<"optional_invalid_k", [{ + ($arg^) +}]>, Arguments<(ins Variadic:$arg)>; + //===----------------------------------------------------------------------===// // Variables //===----------------------------------------------------------------------===// -// CHECK: error: expected variable to refer to a argument or result +// CHECK: error: expected variable to refer to a argument, result, or successor def VariableInvalidA : TestFormat_Op<"variable_invalid_a", [{ $unknown_arg attr-dict }]>; @@ -199,6 +263,18 @@ def VariableInvalidE : TestFormat_Op<"variable_invalid_e", [{ $result attr-dict }]>, Results<(outs I64:$result)>; +// CHECK: error: successor 'successor' is already bound +def VariableInvalidF : TestFormat_Op<"variable_invalid_f", [{ + $successor $successor attr-dict +}]> { + let successors = (successor AnySuccessor:$successor); +} +// CHECK: error: successor 'successor' is already bound +def VariableInvalidG : TestFormat_Op<"variable_invalid_g", [{ + successors $successor attr-dict +}]> { + let successors = (successor AnySuccessor:$successor); +} //===----------------------------------------------------------------------===// // Coverage Checks diff --git a/mlir/test/mlir-tblgen/op-format.mlir b/mlir/test/mlir-tblgen/op-format.mlir --- a/mlir/test/mlir-tblgen/op-format.mlir +++ b/mlir/test/mlir-tblgen/op-format.mlir @@ -12,6 +12,9 @@ // CHECK-NOT: {attr test.format_attr_op 10 +// CHECK: test.format_attr_dict_w_keyword attributes {attr = 10 : i64} +test.format_attr_dict_w_keyword attributes {attr = 10 : i64} + // CHECK: test.format_buildable_type_op %[[I64]] %ignored = test.format_buildable_type_op %i64 @@ -38,3 +41,19 @@ // CHECK: test.format_operand_e_op %[[I64]], %[[MEMREF]] : i64, memref<1xf64> test.format_operand_e_op %i64, %memref : i64, memref<1xf64> + +"foo.successor_test_region"() ( { + ^bb0: + // CHECK: test.format_successor_a_op ^bb1 {attr} + test.format_successor_a_op ^bb1 {attr} + + ^bb1: + // CHECK: test.format_successor_a_op ^bb1, ^bb2 {attr} + test.format_successor_a_op ^bb1, ^bb2 {attr} + + ^bb2: + // CHECK: test.format_successor_a_op {attr} + test.format_successor_a_op {attr} + +}) { arg_names = ["i", "j", "k"] } : () -> () + diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp --- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -183,6 +183,9 @@ // Generates getters for named regions. void genNamedRegionGetters(); + // Generates getters for named successors. + void genNamedSuccessorGetters(); + // Generates builder methods for the operation. void genBuilder(); @@ -266,6 +269,10 @@ // The generated code will be attached to `body`. void genRegionVerifier(OpMethodBody &body); + // Generates verify statements for successors in the operation. + // The generated code will be attached to `body`. + void genSuccessorVerifier(OpMethodBody &body); + // Generates the traits used by the object. void genTraits(); @@ -302,6 +309,7 @@ genNamedOperandGetters(); genNamedResultGetters(); genNamedRegionGetters(); + genNamedSuccessorGetters(); genAttrGetters(); genAttrSetters(); genBuilder(); @@ -579,6 +587,42 @@ } } +void OpEmitter::genNamedSuccessorGetters() { + unsigned numSuccessors = op.getNumSuccessors(); + for (unsigned i = 0; i < numSuccessors; ++i) { + const NamedSuccessor &successor = op.getSuccessor(i); + if (successor.name.empty()) + continue; + + // Generate the accessors for a variadic successor. 
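+    // For example, a variadic successor at index 0 yields, roughly:
+    //   SuccessorRange targets() {
+    //     return {std::next(this->getOperation()->successor_begin(), 0),
+    //             this->getOperation()->successor_end()};
+    //   }
+    // ('targets' is an illustrative name; the body mirrors the formatv
+    // template below.)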
+ if (successor.isVariadic()) { + // Generate the getter. + auto &m = opClass.newMethod("SuccessorRange", successor.name); + m.body() << formatv( + " return {std::next(this->getOperation()->successor_begin(), {0}), " + "this->getOperation()->successor_end()};", + i); + continue; + } + + // Generate the block getter. + auto &m = opClass.newMethod("Block *", successor.name); + m.body() << formatv(" return this->getOperation()->getSuccessor({0});", i); + + // Generate the all-operands getter. + auto &operandsMethod = opClass.newMethod( + "Operation::operand_range", (successor.name + "Operands").str()); + operandsMethod.body() << formatv( + " return this->getOperation()->getSuccessorOperands({0});", i); + + // Generate the individual-operand getter. + auto &operandMethod = opClass.newMethod( + "Value", (successor.name + "Operand").str(), "unsigned index"); + operandMethod.body() << formatv( + " return this->getOperation()->getSuccessorOperand({0}, index);", i); + } +} + static bool canGenerateUnwrappedBuilder(Operator &op) { // If this op does not have native attributes at all, return directly to avoid // redefining builders. @@ -869,8 +913,9 @@ // Generate builder that infers type too. // TODO(jpienaar): Subsume this with general checking if type can be infered // automatically. - // TODO(jpienaar): Expand to handle regions. - if (op.getTrait("InferTypeOpInterface::Trait") && op.getNumRegions() == 0) + // TODO(jpienaar): Expand to handle regions and successors. + if (op.getTrait("InferTypeOpInterface::Trait") && op.getNumRegions() == 0 && + op.getNumSuccessors() == 0) genInferedTypeCollectiveParamBuilder(); } @@ -982,17 +1027,28 @@ ++numAttrs; } } + + /// Insert parameters for the block and operands for each successor. + const char *variadicSuccCode = + ", ArrayRef {0}, ArrayRef {0}Operands"; + const char *succCode = ", Block *{0}, ValueRange {0}Operands"; + for (const NamedSuccessor &namedSuccessor : op.getSuccessors()) { + if (namedSuccessor.isVariadic()) + paramList += llvm::formatv(variadicSuccCode, namedSuccessor.name).str(); + else + paramList += llvm::formatv(succCode, namedSuccessor.name).str(); + } } void OpEmitter::genCodeForAddingArgAndRegionForBuilder(OpMethodBody &body, bool isRawValueAttr) { - // Push all operands to the result + // Push all operands to the result. for (int i = 0, e = op.getNumOperands(); i < e; ++i) { body << " " << builderOpState << ".addOperands(" << getArgumentName(op, i) << ");\n"; } - // Push all attributes to the result + // Push all attributes to the result. for (const auto &namedAttr : op.getAttributes()) { auto &attr = namedAttr.attr; if (!attr.isDerivedAttr()) { @@ -1030,11 +1086,24 @@ } } - // Create the correct number of regions + // Create the correct number of regions. if (int numRegions = op.getNumRegions()) { for (int i = 0; i < numRegions; ++i) body << " (void)" << builderOpState << ".addRegion();\n"; } + + // Push all successors to the result. 
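+  // A variadic successor contributes one addSuccessor call per
+  // (block, operand-list) pair; a single successor maps to exactly one call.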
+ for (const NamedSuccessor &namedSuccessor : op.getSuccessors()) { + if (namedSuccessor.isVariadic()) { + body << formatv(" for (int i = 0, e = {1}.size(); i != e; ++i)\n" + " {0}.addSuccessor({1}[i], {1}Operands[i]);\n", + builderOpState, namedSuccessor.name); + continue; + } + + body << formatv(" {0}.addSuccessor({1}, {1}Operands);\n", builderOpState, + namedSuccessor.name); + } } void OpEmitter::genCanonicalizerDecls() { @@ -1228,6 +1297,7 @@ } genRegionVerifier(body); + genSuccessorVerifier(body); if (hasCustomVerify) { FmtContext fctx; @@ -1305,6 +1375,58 @@ } } +void OpEmitter::genSuccessorVerifier(OpMethodBody &body) { + unsigned numSuccessors = op.getNumSuccessors(); + + const char *checkSuccessorSizeCode = R"( + if (this->getOperation()->getNumSuccessors() {0} {1}) { + return emitOpError("has incorrect number of successors: expected{2} {1}" + " but found ") + << this->getOperation()->getNumSuccessors(); + } + )"; + + // Verify this op has the correct number of successors. + unsigned numVariadicSuccessors = op.getNumVariadicSuccessors(); + if (numVariadicSuccessors == 0) { + body << formatv(checkSuccessorSizeCode, "!=", numSuccessors, ""); + } else if (numVariadicSuccessors != numSuccessors) { + body << formatv(checkSuccessorSizeCode, "<", + numSuccessors - numVariadicSuccessors, " at least"); + } + + // If we have no successors, there is nothing more to do. + if (numSuccessors == 0) + return; + + body << "{\n"; + body << " unsigned index = 0; (void)index;\n"; + + for (unsigned i = 0; i < numSuccessors; ++i) { + const auto &successor = op.getSuccessor(i); + if (successor.constraint.getPredicate().isNull()) + continue; + + body << " for (Block *successor : "; + body << formatv(successor.isVariadic() ? "{0}()" + : "ArrayRef({0}())", + successor.name); + body << ") {\n"; + auto constraint = tgfmt(successor.constraint.getConditionTemplate(), + &verifyCtx.withSelf("successor")) + .str(); + + body << formatv( + " (void)successor;\n" + " if (!({0})) {\n " + "return emitOpError(\"successor #\") << index << \"('{2}') failed to " + "verify constraint: {3}\";\n }\n", + constraint, i, successor.name, successor.constraint.getDescription()); + body << " }\n"; + } + body << " }\n"; +} + void OpEmitter::genTraits() { int numResults = op.getNumResults(); int numVariadicResults = op.getNumVariadicResults(); @@ -1342,7 +1464,9 @@ int numVariadicOperands = op.getNumVariadicOperands(); // Add operand size trait. - if (numVariadicOperands != 0) { + // Note: Successor operands are also included in the operation's operand list, + // so we always need to use VariadicOperands in the presence of successors. + if (numVariadicOperands != 0 || op.getNumSuccessors()) { if (numOperands == numVariadicOperands) opClass.addTrait("OpTrait::VariadicOperands"); else diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -49,6 +49,7 @@ FunctionalTypeDirective, OperandsDirective, ResultsDirective, + SuccessorsDirective, TypeDirective, /// This element is a literal. @@ -58,6 +59,10 @@ AttributeVariable, OperandVariable, ResultVariable, + SuccessorVariable, + + /// This element is an optional element. + Optional, }; Element(Kind kind) : kind(kind) {} virtual ~Element() = default; @@ -102,6 +107,10 @@ /// This class represents a variable that refers to a result. using ResultVariable = VariableElement; + +/// This class represents a variable that refers to a successor. 
+using SuccessorVariable =
+    VariableElement<NamedSuccessor, Element::Kind::SuccessorVariable>;
} // end anonymous namespace

//===----------------------------------------------------------------------===//

@@ -115,10 +124,6 @@
   DirectiveElement() : Element(type){};
   static bool classof(const Element *ele) { return ele->getKind() == type; }
 };
-/// This class represents the `attr-dict` directive. This directive represents
-/// the attribute dictionary of the operation.
-using AttrDictDirective = DirectiveElement<Element::Kind::AttrDictDirective>;
-
 /// This class represents the `operands` directive. This directive represents
 /// all of the operands of an operation.
 using OperandsDirective = DirectiveElement<Element::Kind::OperandsDirective>;
@@ -127,10 +132,28 @@
 /// all of the results of an operation.
 using ResultsDirective = DirectiveElement<Element::Kind::ResultsDirective>;

+/// This class represents the `successors` directive. This directive represents
+/// all of the successors of an operation.
+using SuccessorsDirective =
+    DirectiveElement<Element::Kind::SuccessorsDirective>;
+
+/// This class represents the `attr-dict` directive. This directive represents
+/// the attribute dictionary of the operation.
+class AttrDictDirective
+    : public DirectiveElement<Element::Kind::AttrDictDirective> {
+public:
+  explicit AttrDictDirective(bool withKeyword) : withKeyword(withKeyword) {}
+  bool isWithKeyword() const { return withKeyword; }
+
+private:
+  /// If the dictionary should be printed with the 'attributes' keyword.
+  bool withKeyword;
+};
+
 /// This class represents the `functional-type` directive. This directive takes
 /// two arguments and formats them, respectively, as the inputs and results of a
 /// FunctionType.
-struct FunctionalTypeDirective
+class FunctionalTypeDirective
     : public DirectiveElement<Element::Kind::FunctionalTypeDirective> {
 public:
   FunctionalTypeDirective(std::unique_ptr<Element> inputs,
@@ -145,7 +168,7 @@
 };

 /// This class represents the `type` directive.
-struct TypeDirective : public DirectiveElement<Element::Kind::TypeDirective> {
+class TypeDirective : public DirectiveElement<Element::Kind::TypeDirective> {
 public:
   TypeDirective(std::unique_ptr<Element> arg) : operand(std::move(arg)) {}
   Element *getOperand() const { return operand.get(); }
@@ -164,7 +187,7 @@
 class LiteralElement : public Element {
 public:
   LiteralElement(StringRef literal)
-      : Element{Kind::Literal}, literal(literal){};
+      : Element{Kind::Literal}, literal(literal) {}
   static bool classof(const Element *element) {
     return element->getKind() == Kind::Literal;
   }
@@ -203,6 +226,36 @@
   });
 }

+//===----------------------------------------------------------------------===//
+// OptionalElement
+
+namespace {
+/// This class represents a group of elements that are optionally emitted based
+/// upon an optional variable of the operation.
+class OptionalElement : public Element {
+public:
+  OptionalElement(std::vector<std::unique_ptr<Element>> &&elements,
+                  unsigned anchor)
+      : Element{Kind::Optional}, elements(std::move(elements)), anchor(anchor) {
+  }
+  static bool classof(const Element *element) {
+    return element->getKind() == Kind::Optional;
+  }
+
+  /// Return the nested elements of this grouping.
+  auto getElements() const { return llvm::make_pointee_range(elements); }
+
+  /// Return the anchor of this optional group.
+  Element *getAnchor() const { return elements[anchor].get(); }
+
+private:
+  /// The child elements of this optional.
+  std::vector<std::unique_ptr<Element>> elements;
+  /// The index of the element that acts as the anchor for the optional group.
+ unsigned anchor; +}; +} // end anonymous namespace + //===----------------------------------------------------------------------===// // OperationFormat //===----------------------------------------------------------------------===// @@ -219,16 +272,26 @@ void setBuilderIdx(int idx) { builderIdx = idx; } /// Get the variable this type is resolved to, or None. - Optional getVariable() const { return variableName; } - void setVariable(StringRef variable) { variableName = variable; } + const NamedTypeConstraint *getVariable() const { return variable; } + Optional getVarTransformer() const { + return variableTransformer; + } + void setVariable(const NamedTypeConstraint *var, + Optional transformer) { + variable = var; + variableTransformer = transformer; + } private: /// If the type is resolved with a buildable type, this is the index into /// 'buildableTypes' in the parent format. Optional builderIdx; /// If the type is resolved based upon another operand or result, this is - /// the name of the variable that this type is resolved to. - Optional variableName; + /// the variable that this type is resolved to. + const NamedTypeConstraint *variable; + /// If the type is resolved based upon another operand or result, this is + /// a transformer to apply to the variable when resolving. + Optional variableTransformer; }; OperationFormat(const Operator &op) @@ -242,6 +305,8 @@ /// Generate the c++ to resolve the types of operands and results during /// parsing. void genParserTypeResolution(Operator &op, OpMethodBody &body); + /// Generate the c++ to resolve successors during parsing. + void genParserSuccessorResolution(Operator &op, OpMethodBody &body); /// Generate the operation printer from this format. void genPrinter(Operator &op, OpClass &opClass); @@ -317,32 +382,26 @@ const char *const variadicOperandParserCode = R"( llvm::SMLoc {0}OperandsLoc = parser.getCurrentLocation(); (void){0}OperandsLoc; - SmallVector {0}Operands; if (parser.parseOperandList({0}Operands)) return failure(); )"; const char *const operandParserCode = R"( llvm::SMLoc {0}OperandsLoc = parser.getCurrentLocation(); (void){0}OperandsLoc; - OpAsmParser::OperandType {0}RawOperands[1]; if (parser.parseOperand({0}RawOperands[0])) return failure(); - ArrayRef {0}Operands({0}RawOperands); )"; /// The code snippet used to generate a parser call for a type list. /// /// {0}: The name for the type list. const char *const variadicTypeParserCode = R"( - SmallVector {0}Types; if (parser.parseTypeList({0}Types)) return failure(); )"; const char *const typeParserCode = R"( - Type {0}RawTypes[1] = {{nullptr}; if (parser.parseType({0}RawTypes[0])) return failure(); - ArrayRef {0}Types({0}RawTypes); )"; /// The code snippet used to generate a parser call for a functional type. @@ -353,8 +412,53 @@ FunctionType {0}__{1}_functionType; if (parser.parseType({0}__{1}_functionType)) return failure(); - ArrayRef {0}Types = {0}__{1}_functionType.getInputs(); - ArrayRef {1}Types = {0}__{1}_functionType.getResults(); + {0}Types = {0}__{1}_functionType.getInputs(); + {1}Types = {0}__{1}_functionType.getResults(); +)"; + +/// The code snippet used to generate a parser call for a successor list. +/// +/// {0}: The name for the successor list. +const char *successorListParserCode = R"( + SmallVector>, 2> {0}Successors; + { + Block *succ; + SmallVector succOperands; + // Parse the first successor. 
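+    // The first successor is parsed optionally so that an empty successor
+    // list still succeeds; once a comma is seen, further successors are
+    // required.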
+ auto firstSucc = parser.parseOptionalSuccessorAndUseList(succ, + succOperands); + if (firstSucc.hasValue()) { + if (failed(*firstSucc)) + return failure(); + {0}Successors.emplace_back(succ, succOperands); + + // Parse any trailing successors. + while (succeeded(parser.parseOptionalComma())) { + succOperands.clear(); + if (parser.parseSuccessorAndUseList(succ, succOperands)) + return failure(); + {0}Successors.emplace_back(succ, succOperands); + } + } + } +)"; + +/// The code snippet used to generate a parser call for a successor. +/// +/// {0}: The name of the successor. +const char *successorParserCode = R"( + Block *{0}Successor = nullptr; + SmallVector {0}Operands; + if (parser.parseSuccessorAndUseList({0}Successor, {0}Operands)) + return failure(); +)"; + +/// The code snippet used to resolve a list of parsed successors. +/// +/// {0}: The name of the successor list. +const char *resolveSuccessorListParserCode = R"( + for (auto &succAndArgs : {0}Successors) + result.addSuccessor(succAndArgs.first, succAndArgs.second); )"; /// Get the name used for the type list for the given type directive operand. @@ -378,25 +482,152 @@ /// Generate the parser for a literal value. static void genLiteralParser(StringRef value, OpMethodBody &body) { - body << " if (parser.parse"; - // Handle the case of a keyword/identifier. if (value.front() == '_' || isalpha(value.front())) { body << "Keyword(\"" << value << "\")"; + return; + } + body << (StringRef)llvm::StringSwitch(value) + .Case("->", "Arrow()") + .Case(":", "Colon()") + .Case(",", "Comma()") + .Case("=", "Equal()") + .Case("<", "Less()") + .Case(">", "Greater()") + .Case("(", "LParen()") + .Case(")", "RParen()") + .Case("[", "LSquare()") + .Case("]", "RSquare()"); +} + +/// Generate the storage code required for parsing the given element. +static void genElementParserStorage(Element *element, OpMethodBody &body) { + if (auto *optional = dyn_cast(element)) { + for (auto &childElement : optional->getElements()) + genElementParserStorage(&childElement, body); + } else if (auto *operand = dyn_cast(element)) { + StringRef name = operand->getVar()->name; + if (operand->getVar()->isVariadic()) + body << " SmallVector " << name + << "Operands;\n"; + else + body << " OpAsmParser::OperandType " << name << "RawOperands[1];\n" + << " ArrayRef " << name << "Operands(" + << name << "RawOperands);"; + } else if (auto *dir = dyn_cast(element)) { + bool variadic = false; + StringRef name = getTypeListName(dir->getOperand(), variadic); + if (variadic) + body << " SmallVector " << name << "Types;\n"; + else + body << llvm::formatv(" Type {0}RawTypes[1];\n", name) + << llvm::formatv(" ArrayRef {0}Types({0}RawTypes);\n", name); + } else if (auto *dir = dyn_cast(element)) { + bool ignored = false; + body << " ArrayRef " << getTypeListName(dir->getInputs(), ignored) + << "Types;\n"; + body << " ArrayRef " << getTypeListName(dir->getResults(), ignored) + << "Types;\n"; + } +} + +/// Generate the parser for a single format element. +static void genElementParser(Element *element, OpMethodBody &body, + FmtContext &attrTypeCtx) { + /// Optional Group. + if (auto *optional = dyn_cast(element)) { + auto elements = optional->getElements(); + + // Generate a special optional parser for the first element to gate the + // parsing of the rest of the elements. 
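+    // e.g. an optional group that starts with a `,` literal emits, roughly:
+    //   if (succeeded(parser.parseOptionalComma())) { ...nested parsers... }
+    // (Illustrative; the literal-to-parser mapping comes from
+    // genLiteralParser above.)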
+ if (auto *literal = dyn_cast(&*elements.begin())) { + body << " if (succeeded(parser.parseOptional"; + genLiteralParser(literal->getLiteral(), body); + body << ")) {\n"; + } else if (auto *opVar = dyn_cast(&*elements.begin())) { + genElementParser(opVar, body, attrTypeCtx); + body << " if (!" << opVar->getVar()->name << "Operands.empty()) {\n"; + } + + // Generate the rest of the elements normally. + for (auto &childElement : llvm::drop_begin(elements, 1)) + genElementParser(&childElement, body, attrTypeCtx); + body << " }\n"; + + /// Literals. + } else if (LiteralElement *literal = dyn_cast(element)) { + body << " if (parser.parse"; + genLiteralParser(literal->getLiteral(), body); + body << ")\n return failure();\n"; + + /// Arguments. + } else if (auto *attr = dyn_cast(element)) { + const NamedAttribute *var = attr->getVar(); + + // Check to see if we can parse this as an enum attribute. + if (canFormatEnumAttr(var)) { + const EnumAttr &enumAttr = cast(var->attr); + + // Generate the code for building an attribute for this enum. + std::string attrBuilderStr; + { + llvm::raw_string_ostream os(attrBuilderStr); + os << tgfmt(enumAttr.getConstBuilderTemplate(), &attrTypeCtx, + "attrOptional.getValue()"); + } + + body << formatv(enumAttrParserCode, var->name, enumAttr.getCppNamespace(), + enumAttr.getStringToSymbolFnName(), attrBuilderStr); + return; + } + + // If this attribute has a buildable type, use that when parsing the + // attribute. + std::string attrTypeStr; + if (Optional attrType = var->attr.getValueType()) { + if (Optional typeBuilder = attrType->getBuilderCall()) { + llvm::raw_string_ostream os(attrTypeStr); + os << ", " << tgfmt(*typeBuilder, &attrTypeCtx); + } + } + + body << formatv(attrParserCode, var->attr.getStorageType(), var->name, + attrTypeStr); + } else if (auto *operand = dyn_cast(element)) { + bool isVariadic = operand->getVar()->isVariadic(); + body << formatv(isVariadic ? variadicOperandParserCode : operandParserCode, + operand->getVar()->name); + } else if (auto *successor = dyn_cast(element)) { + bool isVariadic = successor->getVar()->isVariadic(); + body << formatv(isVariadic ? successorListParserCode : successorParserCode, + successor->getVar()->name); + + /// Directives. + } else if (auto *attrDict = dyn_cast(element)) { + body << " if (parser.parseOptionalAttrDict" + << (attrDict->isWithKeyword() ? "WithKeyword" : "") + << "(result.attributes))\n" + << " return failure();\n"; + } else if (isa(element)) { + body << " llvm::SMLoc allOperandLoc = parser.getCurrentLocation();\n" + << " SmallVector allOperands;\n" + << " if (parser.parseOperandList(allOperands))\n" + << " return failure();\n"; + } else if (isa(element)) { + body << llvm::formatv(successorListParserCode, "full"); + } else if (auto *dir = dyn_cast(element)) { + bool isVariadic = false; + StringRef listName = getTypeListName(dir->getOperand(), isVariadic); + body << formatv(isVariadic ? 
variadicTypeParserCode : typeParserCode, + listName); + } else if (auto *dir = dyn_cast(element)) { + bool ignored = false; + body << formatv(functionalTypeParserCode, + getTypeListName(dir->getInputs(), ignored), + getTypeListName(dir->getResults(), ignored)); } else { - body << (StringRef)llvm::StringSwitch(value) - .Case("->", "Arrow()") - .Case(":", "Colon()") - .Case(",", "Comma()") - .Case("=", "Equal()") - .Case("<", "Less()") - .Case(">", "Greater()") - .Case("(", "LParen()") - .Case(")", "RParen()") - .Case("[", "LSquare()") - .Case("]", "RSquare()"); + llvm_unreachable("unknown format element"); } - body << ")\n return failure();\n"; } void OperationFormat::genParser(Operator &op, OpClass &opClass) { @@ -405,88 +636,57 @@ OpMethod::MP_Static); auto &body = method.body(); + // Generate variables to store the operands and type within the format. This + // allows for referencing these variables in the presence of optional + // groupings. + for (auto &element : elements) + genElementParserStorage(&*element, body); + // A format context used when parsing attributes with buildable types. FmtContext attrTypeCtx; attrTypeCtx.withBuilder("parser.getBuilder()"); // Generate parsers for each of the elements. - for (auto &element : elements) { - /// Literals. - if (LiteralElement *literal = dyn_cast(element.get())) { - genLiteralParser(literal->getLiteral(), body); + for (auto &element : elements) + genElementParser(element.get(), body, attrTypeCtx); - /// Arguments. - } else if (auto *attr = dyn_cast(element.get())) { - const NamedAttribute *var = attr->getVar(); - - // Check to see if we can parse this as an enum attribute. - if (canFormatEnumAttr(var)) { - const EnumAttr &enumAttr = cast(var->attr); - - // Generate the code for building an attribute for this enum. - std::string attrBuilderStr; - { - llvm::raw_string_ostream os(attrBuilderStr); - os << tgfmt(enumAttr.getConstBuilderTemplate(), &attrTypeCtx, - "attrOptional.getValue()"); - } - - body << formatv(enumAttrParserCode, var->name, - enumAttr.getCppNamespace(), - enumAttr.getStringToSymbolFnName(), attrBuilderStr); - continue; - } - - // If this attribute has a buildable type, use that when parsing the - // attribute. - std::string attrTypeStr; - if (Optional attrType = var->attr.getValueType()) { - if (Optional typeBuilder = attrType->getBuilderCall()) { - llvm::raw_string_ostream os(attrTypeStr); - os << ", " << tgfmt(*typeBuilder, &attrTypeCtx); - } - } - - body << formatv(attrParserCode, var->attr.getStorageType(), var->name, - attrTypeStr); - } else if (auto *operand = dyn_cast(element.get())) { - bool isVariadic = operand->getVar()->isVariadic(); - body << formatv(isVariadic ? variadicOperandParserCode - : operandParserCode, - operand->getVar()->name); - - /// Directives. - } else if (isa(element.get())) { - body << " if (parser.parseOptionalAttrDict(result.attributes))\n" - << " return failure();\n"; - } else if (isa(element.get())) { - body << " llvm::SMLoc allOperandLoc = parser.getCurrentLocation();\n" - << " SmallVector allOperands;\n" - << " if (parser.parseOperandList(allOperands))\n" - << " return failure();\n"; - } else if (auto *dir = dyn_cast(element.get())) { - bool isVariadic = false; - StringRef listName = getTypeListName(dir->getOperand(), isVariadic); - body << formatv(isVariadic ? 
variadicTypeParserCode : typeParserCode, - listName); - } else if (auto *dir = dyn_cast(element.get())) { - bool ignored = false; - body << formatv(functionalTypeParserCode, - getTypeListName(dir->getInputs(), ignored), - getTypeListName(dir->getResults(), ignored)); - } else { - llvm_unreachable("unknown format element"); - } - } - - // Generate the code to resolve the operand and result types now that they - // have been parsed. + // Generate the code to resolve the operand/result types and successors now + // that they have been parsed. genParserTypeResolution(op, body); + genParserSuccessorResolution(op, body); body << " return success();\n"; } void OperationFormat::genParserTypeResolution(Operator &op, OpMethodBody &body) { + // If any of type resolutions use transformed variables, make sure that the + // types of those variables are resolved. + SmallPtrSet verifiedVariables; + FmtContext verifierFCtx; + for (TypeResolution &resolver : + llvm::concat(resultTypes, operandTypes)) { + Optional transformer = resolver.getVarTransformer(); + if (!transformer) + continue; + // Ensure that we don't verify the same variables twice. + const NamedTypeConstraint *variable = resolver.getVariable(); + if (!verifiedVariables.insert(variable).second) + continue; + + auto constraint = variable->constraint; + body << " for (Type type : " << variable->name << "Types) {\n" + << " (void)type;\n" + << " if (!(" + << tgfmt(constraint.getConditionTemplate(), + &verifierFCtx.withSelf("type")) + << ")) {\n" + << formatv(" return parser.emitError(parser.getNameLoc()) << " + "\"'{0}' must be {1}, but got \" << type;\n", + variable->name, constraint.getDescription()) + << " }\n" + << " }\n"; + } + // Initialize the set of buildable types. if (!buildableTypes.empty()) { body << " Builder &builder = parser.getBuilder();\n"; @@ -498,18 +698,27 @@ << tgfmt(it.first, &typeBuilderCtx) << ";\n"; } + // Emit the code necessary for a type resolver. + auto emitTypeResolver = [&](TypeResolution &resolver, StringRef curVar) { + if (Optional val = resolver.getBuilderIdx()) { + body << "odsBuildableType" << *val; + } else if (const NamedTypeConstraint *var = resolver.getVariable()) { + if (Optional tform = resolver.getVarTransformer()) + body << tgfmt(*tform, &FmtContext().withSelf(var->name + "Types[0]")); + else + body << var->name << "Types"; + } else { + body << curVar << "Types"; + } + }; + // Resolve each of the result types. if (allResultTypes) { body << " result.addTypes(allResultTypes);\n"; } else { for (unsigned i = 0, e = op.getNumResults(); i != e; ++i) { body << " result.addTypes("; - if (Optional val = resultTypes[i].getBuilderIdx()) - body << "odsBuildableType" << *val; - else if (Optional var = resultTypes[i].getVariable()) - body << *var << "Types"; - else - body << op.getResultName(i) << "Types"; + emitTypeResolver(resultTypes[i], op.getResultName(i)); body << ");\n"; } } @@ -552,25 +761,19 @@ if (hasAllOperands) { body << " if (parser.resolveOperands(allOperands, "; - auto emitOperandType = [&](int idx) { - if (Optional val = operandTypes[idx].getBuilderIdx()) - body << "ArrayRef(odsBuildableType" << *val << ")"; - else if (Optional var = operandTypes[idx].getVariable()) - body << *var << "Types"; - else - body << op.getOperand(idx).name << "Types"; - }; - // Group all of the operand types together to perform the resolution all at // once. Use llvm::concat to perform the merge. llvm::concat does not allow // the case of a single range, so guard it here. 
if (op.getNumOperands() > 1) { body << "llvm::concat("; - interleaveComma(llvm::seq(0, op.getNumOperands()), body, - emitOperandType); + interleaveComma(llvm::seq(0, op.getNumOperands()), body, [&](int i) { + body << "ArrayRef("; + emitTypeResolver(operandTypes[i], op.getOperand(i).name); + body << ")"; + }); body << ")"; } else { - emitOperandType(/*idx=*/0); + emitTypeResolver(operandTypes.front(), op.getOperand(0).name); } body << ", allOperandLoc, result.operands))\n" @@ -583,13 +786,34 @@ for (unsigned i = 0, e = op.getNumOperands(); i != e; ++i) { NamedTypeConstraint &operand = op.getOperand(i); body << " if (parser.resolveOperands(" << operand.name << "Operands, "; - if (Optional val = operandTypes[i].getBuilderIdx()) - body << "odsBuildableType" << *val << ", "; - else if (Optional var = operandTypes[i].getVariable()) - body << *var << "Types, " << operand.name << "OperandsLoc, "; - else - body << operand.name << "Types, " << operand.name << "OperandsLoc, "; - body << "result.operands))\n return failure();\n"; + emitTypeResolver(operandTypes[i], operand.name); + + // If this isn't a buildable type, verify the sizes match by adding the loc. + if (!operandTypes[i].getBuilderIdx()) + body << ", " << operand.name << "OperandsLoc"; + body << ", result.operands))\n return failure();\n"; + } +} + +void OperationFormat::genParserSuccessorResolution(Operator &op, + OpMethodBody &body) { + // Check for the case where all successors were parsed. + bool hasAllSuccessors = llvm::any_of( + elements, [](auto &elt) { return isa(elt.get()); }); + if (hasAllSuccessors) { + body << llvm::formatv(resolveSuccessorListParserCode, "full"); + return; + } + + // Otherwise, handle each successor individually. + for (const NamedSuccessor &successor : op.getSuccessors()) { + if (successor.isVariadic()) { + body << llvm::formatv(resolveSuccessorListParserCode, successor.name); + continue; + } + + body << llvm::formatv(" result.addSuccessor({0}Successor, {0}Operands);\n", + successor.name); } } @@ -597,14 +821,16 @@ // PrinterGen /// Generate the printer for the 'attr-dict' directive. -static void genAttrDictPrinter(OperationFormat &fmt, OpMethodBody &body) { +static void genAttrDictPrinter(OperationFormat &fmt, OpMethodBody &body, + bool withKeyword) { // Collect all of the attributes used in the format, these will be elided. SmallVector usedAttributes; for (auto &it : fmt.elements) if (auto *attr = dyn_cast(it.get())) usedAttributes.push_back(attr->getVar()); - body << " p.printOptionalAttrDict(getAttrs(), /*elidedAttrs=*/{"; + body << " p.printOptionalAttrDict" << (withKeyword ? "WithKeyword" : "") + << "(getAttrs(), /*elidedAttrs=*/{"; interleaveComma(usedAttributes, body, [&](const NamedAttribute *attr) { body << "\"" << attr->name << "\""; }); @@ -636,7 +862,7 @@ lastWasPunctuation = !(value.front() == '_' || isalpha(value.front())); } -/// Generate the c++ for an operand to a (*-)type directive. +/// Generate the C++ for an operand to a (*-)type directive. static OpMethodBody &genTypeOperandPrinter(Element *arg, OpMethodBody &body) { if (isa(arg)) return body << "getOperation()->getOperandTypes()"; @@ -649,6 +875,101 @@ return body << "ArrayRef(" << var->name << "().getType())"; } +/// Generate the code for printing the given element. 
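+/// For instance, a bound successor variable prints through
+/// p.printSuccessorAndUseList(*this, index), while the `successors` directive
+/// prints the entire comma-separated successor list (see the cases below).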
+static void genElementPrinter(Element *element, OpMethodBody &body, + OperationFormat &fmt, Operator &op, + bool &shouldEmitSpace, bool &lastWasPunctuation) { + if (LiteralElement *literal = dyn_cast(element)) + return genLiteralPrinter(literal->getLiteral(), body, shouldEmitSpace, + lastWasPunctuation); + + // Emit an optional group. + if (OptionalElement *optional = dyn_cast(element)) { + // Emit the check for the presence of the anchor element. + Element *anchor = optional->getAnchor(); + if (AttributeVariable *attrVar = dyn_cast(anchor)) + body << " if (getAttr(\"" << attrVar->getVar()->name << "\")) {\n"; + else + body << " if (!" << cast(anchor)->getVar()->name + << "().empty()) {\n"; + + // Emit each of the elements. + for (Element &childElement : optional->getElements()) + genElementPrinter(&childElement, body, fmt, op, shouldEmitSpace, + lastWasPunctuation); + body << " }\n"; + return; + } + + // Emit the attribute dictionary. + if (auto *attrDict = dyn_cast(element)) { + genAttrDictPrinter(fmt, body, attrDict->isWithKeyword()); + lastWasPunctuation = false; + return; + } + + // Optionally insert a space before the next element. The AttrDict printer + // already adds a space as necessary. + if (shouldEmitSpace || !lastWasPunctuation) + body << " p << \" \";\n"; + lastWasPunctuation = false; + shouldEmitSpace = true; + + if (auto *attr = dyn_cast(element)) { + const NamedAttribute *var = attr->getVar(); + + // If we are formatting as a enum, symbolize the attribute as a string. + if (canFormatEnumAttr(var)) { + const EnumAttr &enumAttr = cast(var->attr); + body << " p << \"\\\"\" << " << enumAttr.getSymbolToStringFnName() << "(" + << var->name << "()) << \"\\\"\";\n"; + return; + } + + // Elide the attribute type if it is buildable. + Optional attrType = var->attr.getValueType(); + if (attrType && attrType->getBuilderCall()) + body << " p.printAttributeWithoutType(" << var->name << "Attr());\n"; + else + body << " p.printAttribute(" << var->name << "Attr());\n"; + } else if (auto *operand = dyn_cast(element)) { + body << " p << " << operand->getVar()->name << "();\n"; + } else if (auto *successor = dyn_cast(element)) { + const NamedSuccessor *var = successor->getVar(); + if (var->isVariadic()) { + body << " {\n" + << " auto succRange = " << var->name << "();\n" + << " auto opSuccBegin = getOperation()->successor_begin();\n" + << " int i = succRange.begin() - opSuccBegin;\n" + << " int e = i + succRange.size();\n" + << " interleaveComma(llvm::seq(i, e), p, [&](int i) {\n" + << " p.printSuccessorAndUseList(*this, i);\n" + << " });\n" + << " }\n"; + return; + } + + unsigned index = successor->getVar() - op.successor_begin(); + body << " p.printSuccessorAndUseList(*this, " << index << ");\n"; + } else if (isa(element)) { + body << " p << getOperation()->getOperands();\n"; + } else if (isa(element)) { + body << " interleaveComma(llvm::seq(0, " + "getOperation()->getNumSuccessors()), p, [&](int i) {" + << " p.printSuccessorAndUseList(*this, i);" + << " });\n"; + } else if (auto *dir = dyn_cast(element)) { + body << " p << "; + genTypeOperandPrinter(dir->getOperand(), body) << ";\n"; + } else if (auto *dir = dyn_cast(element)) { + body << " p.printFunctionalType("; + genTypeOperandPrinter(dir->getInputs(), body) << ", "; + genTypeOperandPrinter(dir->getResults(), body) << ");\n"; + } else { + llvm_unreachable("unknown format element"); + } +} + void OperationFormat::genPrinter(Operator &op, OpClass &opClass) { auto &method = opClass.newMethod("void", "print", "OpAsmPrinter &p"); auto 
&body = method.body(); @@ -666,60 +987,9 @@ // Flags for if we should emit a space, and if the last element was // punctuation. bool shouldEmitSpace = true, lastWasPunctuation = false; - for (auto &element : elements) { - // Emit a literal element. - if (LiteralElement *literal = dyn_cast(element.get())) { - genLiteralPrinter(literal->getLiteral(), body, shouldEmitSpace, - lastWasPunctuation); - continue; - } - - // Emit the attribute dictionary. - if (isa(element.get())) { - genAttrDictPrinter(*this, body); - lastWasPunctuation = false; - continue; - } - - // Optionally insert a space before the next element. The AttrDict printer - // already adds a space as necessary. - if (shouldEmitSpace || !lastWasPunctuation) - body << " p << \" \";\n"; - lastWasPunctuation = false; - shouldEmitSpace = true; - - if (auto *attr = dyn_cast(element.get())) { - const NamedAttribute *var = attr->getVar(); - - // If we are formatting as a enum, symbolize the attribute as a string. - if (canFormatEnumAttr(var)) { - const EnumAttr &enumAttr = cast(var->attr); - body << " p << \"\\\"\" << " << enumAttr.getSymbolToStringFnName() - << "(" << var->name << "()) << \"\\\"\";\n"; - continue; - } - - // Elide the attribute type if it is buildable. - Optional attrType = var->attr.getValueType(); - if (attrType && attrType->getBuilderCall()) - body << " p.printAttributeWithoutType(" << var->name << "Attr());\n"; - else - body << " p.printAttribute(" << var->name << "Attr());\n"; - } else if (auto *operand = dyn_cast(element.get())) { - body << " p << " << operand->getVar()->name << "();\n"; - } else if (isa(element.get())) { - body << " p << getOperation()->getOperands();\n"; - } else if (auto *dir = dyn_cast(element.get())) { - body << " p << "; - genTypeOperandPrinter(dir->getOperand(), body) << ";\n"; - } else if (auto *dir = dyn_cast(element.get())) { - body << " p.printFunctionalType("; - genTypeOperandPrinter(dir->getInputs(), body) << ", "; - genTypeOperandPrinter(dir->getResults(), body) << ");\n"; - } else { - llvm_unreachable("unknown format element"); - } - } + for (auto &element : elements) + genElementPrinter(element.get(), body, *this, op, shouldEmitSpace, + lastWasPunctuation); } //===----------------------------------------------------------------------===// @@ -738,15 +1008,19 @@ // Tokens with no info. l_paren, r_paren, + caret, comma, equal, + question, // Keywords. keyword_start, kw_attr_dict, + kw_attr_dict_w_keyword, kw_functional_type, kw_operands, kw_results, + kw_successors, kw_type, keyword_end, @@ -868,10 +1142,14 @@ return formToken(Token::eof, tokStart); // Lex punctuation. + case '^': + return formToken(Token::caret, tokStart); case ',': return formToken(Token::comma, tokStart); case '=': return formToken(Token::equal, tokStart); + case '?': + return formToken(Token::question, tokStart); case '(': return formToken(Token::l_paren, tokStart); case ')': @@ -919,13 +1197,16 @@ // Check to see if this identifier is a keyword. 
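 // (Identifier characters include '-', which is how multi-word keywords such
 // as 'attr-dict-with-keyword' lex as a single token.)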
StringRef str(tokStart, curPtr - tokStart); - Token::Kind kind = llvm::StringSwitch(str) - .Case("attr-dict", Token::kw_attr_dict) - .Case("functional-type", Token::kw_functional_type) - .Case("operands", Token::kw_operands) - .Case("results", Token::kw_results) - .Case("type", Token::kw_type) - .Default(Token::identifier); + Token::Kind kind = + llvm::StringSwitch(str) + .Case("attr-dict", Token::kw_attr_dict) + .Case("attr-dict-with-keyword", Token::kw_attr_dict_w_keyword) + .Case("functional-type", Token::kw_functional_type) + .Case("operands", Token::kw_operands) + .Case("results", Token::kw_results) + .Case("successors", Token::kw_successors) + .Case("type", Token::kw_type) + .Default(Token::identifier); return Token(kind, str); } @@ -954,18 +1235,30 @@ LogicalResult parse(); private: + /// This struct represents a type resolution instance. It includes a specific + /// type as well as an optional transformer to apply to that type in order to + /// properly resolve the type of a variable. + struct TypeResolutionInstance { + const NamedTypeConstraint *type; + Optional transformer; + }; + /// Given the values of an `AllTypesMatch` trait, check for inferrable type /// resolution. void handleAllTypesMatchConstraint( ArrayRef values, - llvm::StringMap &variableTyResolver); + llvm::StringMap &variableTyResolver); /// Check for inferrable type resolution given all operands, and or results, /// have the same type. If 'includeResults' is true, the results also have the /// same type as all of the operands. void handleSameTypesConstraint( - llvm::StringMap &variableTyResolver, + llvm::StringMap &variableTyResolver, bool includeResults); + /// Returns an argument with the given name that has been seen within the + /// format. + const NamedTypeConstraint *findSeenArg(StringRef name); + /// Parse a specific element. LogicalResult parseElement(std::unique_ptr &element, bool isTopLevel); @@ -974,16 +1267,25 @@ LogicalResult parseDirective(std::unique_ptr &element, bool isTopLevel); LogicalResult parseLiteral(std::unique_ptr &element); + LogicalResult parseOptional(std::unique_ptr &element, + bool isTopLevel); + LogicalResult parseOptionalChildElement( + std::vector> &childElements, + SmallPtrSetImpl &seenVariables, + Optional &anchorIdx); /// Parse the various different directives. LogicalResult parseAttrDictDirective(std::unique_ptr &element, - llvm::SMLoc loc, bool isTopLevel); + llvm::SMLoc loc, bool isTopLevel, + bool withKeyword); LogicalResult parseFunctionalTypeDirective(std::unique_ptr &element, Token tok, bool isTopLevel); LogicalResult parseOperandsDirective(std::unique_ptr &element, llvm::SMLoc loc, bool isTopLevel); LogicalResult parseResultsDirective(std::unique_ptr &element, llvm::SMLoc loc, bool isTopLevel); + LogicalResult parseSuccessorsDirective(std::unique_ptr &element, + llvm::SMLoc loc, bool isTopLevel); LogicalResult parseTypeDirective(std::unique_ptr &element, Token tok, bool isTopLevel); LogicalResult parseTypeDirectiveOperand(std::unique_ptr &element); @@ -1022,9 +1324,12 @@ // The following are various bits of format state used for verification // during parsing. 
bool hasAllOperands = false, hasAttrDict = false; + bool hasAllSuccessors = false; llvm::SmallBitVector seenOperandTypes, seenResultTypes; llvm::DenseSet seenOperands; llvm::DenseSet seenAttrs; + llvm::DenseSet seenSuccessors; + llvm::DenseSet optionalVariables; }; } // end anonymous namespace @@ -1044,16 +1349,21 @@ return emitError(loc, "format missing 'attr-dict' directive"); // Check for any type traits that we can use for inferring types. - llvm::StringMap variableTyResolver; + llvm::StringMap variableTyResolver; for (const OpTrait &trait : op.getTraits()) { const llvm::Record &def = trait.getDef(); - if (def.isSubClassOf("AllTypesMatch")) + if (def.isSubClassOf("AllTypesMatch")) { handleAllTypesMatchConstraint(def.getValueAsListOfStrings("values"), variableTyResolver); - else if (def.getName() == "SameTypeOperands") + } else if (def.getName() == "SameTypeOperands") { handleSameTypesConstraint(variableTyResolver, /*includeResults=*/false); - else if (def.getName() == "SameOperandsAndResultType") + } else if (def.getName() == "SameOperandsAndResultType") { handleSameTypesConstraint(variableTyResolver, /*includeResults=*/true); + } else if (def.isSubClassOf("TypesMatchWith")) { + if (const auto *lhsArg = findSeenArg(def.getValueAsString("lhs"))) + variableTyResolver[def.getValueAsString("rhs")] = { + lhsArg, def.getValueAsString("transformer")}; + } } // Check that all of the result types can be inferred. @@ -1066,7 +1376,8 @@ // Check to see if we can infer this type from another variable. auto varResolverIt = variableTyResolver.find(op.getResultName(i)); if (varResolverIt != variableTyResolver.end()) { - fmt.resultTypes[i].setVariable(varResolverIt->second->name); + fmt.resultTypes[i].setVariable(varResolverIt->second.type, + varResolverIt->second.transformer); continue; } @@ -1102,7 +1413,8 @@ // Check to see if we can infer this type from another variable. auto varResolverIt = variableTyResolver.find(op.getOperand(i).name); if (varResolverIt != variableTyResolver.end()) { - fmt.operandTypes[i].setVariable(varResolverIt->second->name); + fmt.operandTypes[i].setVariable(varResolverIt->second.type, + varResolverIt->second.transformer); continue; } @@ -1116,35 +1428,39 @@ auto it = buildableTypes.insert({*builder, buildableTypes.size()}); fmt.operandTypes[i].setBuilderIdx(it.first->second); } + + // Check that all of the successors are within the format. + if (!hasAllSuccessors) { + for (unsigned i = 0, e = op.getNumSuccessors(); i != e; ++i) { + const NamedSuccessor &successor = op.getSuccessor(i); + if (!seenSuccessors.count(&successor)) { + return emitError(loc, "format missing instance of successor #" + + Twine(i) + "('" + successor.name + "')"); + } + } + } return success(); } void FormatParser::handleAllTypesMatchConstraint( ArrayRef values, - llvm::StringMap &variableTyResolver) { + llvm::StringMap &variableTyResolver) { for (unsigned i = 0, e = values.size(); i != e; ++i) { // Check to see if this value matches a resolved operand or result type. - const NamedTypeConstraint *arg = nullptr; - if ((arg = findArg(op.getOperands(), values[i]))) { - if (!seenOperandTypes.test(arg - op.operand_begin())) - continue; - } else if ((arg = findArg(op.getResults(), values[i]))) { - if (!seenResultTypes.test(arg - op.result_begin())) - continue; - } else { + const NamedTypeConstraint *arg = findSeenArg(values[i]); + if (!arg) continue; - } // Mark this value as the type resolver for the other variables. 
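    // e.g. given AllTypesMatch<["lhs", "rhs"]> with only type($lhs) in the
    // format, the type parsed for $lhs also resolves the type of $rhs
    // ($lhs/$rhs are illustrative names).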
for (unsigned j = 0; j != i; ++j) - variableTyResolver[values[j]] = arg; + variableTyResolver[values[j]] = {arg, llvm::None}; for (unsigned j = i + 1; j != e; ++j) - variableTyResolver[values[j]] = arg; + variableTyResolver[values[j]] = {arg, llvm::None}; } } void FormatParser::handleSameTypesConstraint( - llvm::StringMap &variableTyResolver, + llvm::StringMap &variableTyResolver, bool includeResults) { const NamedTypeConstraint *resolver = nullptr; int resolvedIt = -1; @@ -1160,14 +1476,22 @@ // Set the resolvers for each operand and result. for (unsigned i = 0, e = op.getNumOperands(); i != e; ++i) if (!seenOperandTypes.test(i) && !op.getOperand(i).name.empty()) - variableTyResolver[op.getOperand(i).name] = resolver; + variableTyResolver[op.getOperand(i).name] = {resolver, llvm::None}; if (includeResults) { for (unsigned i = 0, e = op.getNumResults(); i != e; ++i) if (!seenResultTypes.test(i) && !op.getResultName(i).empty()) - variableTyResolver[op.getResultName(i)] = resolver; + variableTyResolver[op.getResultName(i)] = {resolver, llvm::None}; } } +const NamedTypeConstraint *FormatParser::findSeenArg(StringRef name) { + if (auto *arg = findArg(op.getOperands(), name)) + return seenOperandTypes.test(arg - op.operand_begin()) ? arg : nullptr; + if (auto *arg = findArg(op.getResults(), name)) + return seenResultTypes.test(arg - op.result_begin()) ? arg : nullptr; + return nullptr; +} + LogicalResult FormatParser::parseElement(std::unique_ptr &element, bool isTopLevel) { // Directives. @@ -1176,11 +1500,14 @@ // Literals. if (curToken.getKind() == Token::literal) return parseLiteral(element); + // Optionals. + if (curToken.getKind() == Token::l_paren) + return parseOptional(element, isTopLevel); // Variables. if (curToken.getKind() == Token::variable) return parseVariable(element, isTopLevel); return emitError(curToken.getLoc(), - "expected directive, literal, or variable"); + "expected directive, literal, variable, or optional group"); } LogicalResult FormatParser::parseVariable(std::unique_ptr &element, @@ -1191,7 +1518,8 @@ StringRef name = varTok.getSpelling().drop_front(); llvm::SMLoc loc = varTok.getLoc(); - // Check that the parsed argument is something actually registered on the op. + // Check that the parsed argument is something actually registered on the + // op. /// Attributes if (const NamedAttribute *attr = findArg(op.getAttributes(), name)) { if (isTopLevel && !seenAttrs.insert(attr).second) @@ -1215,7 +1543,17 @@ element = std::make_unique(result); return success(); } - return emitError(loc, "expected variable to refer to a argument or result"); + /// Successors. 
+ if (const auto *successor = findArg(op.getSuccessors(), name)) { + if (!isTopLevel) + return emitError(loc, "successors can only be used at the top level"); + if (hasAllSuccessors || !seenSuccessors.insert(successor).second) + return emitError(loc, "successor '" + name + "' is already bound"); + element = std::make_unique(successor); + return success(); + } + return emitError( + loc, "expected variable to refer to a argument, result, or successor"); } LogicalResult FormatParser::parseDirective(std::unique_ptr &element, @@ -1225,13 +1563,19 @@ switch (dirTok.getKind()) { case Token::kw_attr_dict: - return parseAttrDictDirective(element, dirTok.getLoc(), isTopLevel); + return parseAttrDictDirective(element, dirTok.getLoc(), isTopLevel, + /*withKeyword=*/false); + case Token::kw_attr_dict_w_keyword: + return parseAttrDictDirective(element, dirTok.getLoc(), isTopLevel, + /*withKeyword=*/true); case Token::kw_functional_type: return parseFunctionalTypeDirective(element, dirTok, isTopLevel); case Token::kw_operands: return parseOperandsDirective(element, dirTok.getLoc(), isTopLevel); case Token::kw_results: return parseResultsDirective(element, dirTok.getLoc(), isTopLevel); + case Token::kw_successors: + return parseSuccessorsDirective(element, dirTok.getLoc(), isTopLevel); case Token::kw_type: return parseTypeDirective(element, dirTok, isTopLevel); @@ -1253,9 +1597,119 @@ return success(); } +LogicalResult FormatParser::parseOptional(std::unique_ptr &element, + bool isTopLevel) { + llvm::SMLoc curLoc = curToken.getLoc(); + if (!isTopLevel) + return emitError(curLoc, "optional groups can only be used as top-level " + "elements"); + consumeToken(); + + // Parse the child elements for this optional group. + std::vector> elements; + SmallPtrSet seenVariables; + Optional anchorIdx; + do { + if (failed(parseOptionalChildElement(elements, seenVariables, anchorIdx))) + return failure(); + } while (curToken.getKind() != Token::r_paren); + consumeToken(); + if (failed(parseToken(Token::question, "expected '?' after optional group"))) + return failure(); + + // The optional group is required to have an anchor. + if (!anchorIdx) + return emitError(curLoc, "optional group specified no anchor element"); + + // The first element of the group must be one that can be parsed/printed in an + // optional fashion. + if (!isa(&*elements.front()) && + !isa(&*elements.front())) + return emitError(curLoc, "first element of an operand group must be a " + "literal or operand"); + + // After parsing all of the elements, ensure that all type directives refer + // only to elements within the group. + auto checkTypeOperand = [&](Element *typeEle) { + auto *opVar = dyn_cast(typeEle); + const NamedTypeConstraint *var = opVar ? 
opVar->getVar() : nullptr; + if (!seenVariables.count(var)) + return emitError(curLoc, "type directive can only refer to variables " + "within the optional group"); + return success(); + }; + for (auto &ele : elements) { + if (auto *typeEle = dyn_cast(ele.get())) { + if (failed(checkTypeOperand(typeEle->getOperand()))) + return failure(); + } else if (auto *typeEle = dyn_cast(ele.get())) { + if (failed(checkTypeOperand(typeEle->getInputs())) || + failed(checkTypeOperand(typeEle->getResults()))) + return failure(); + } + } + + optionalVariables.insert(seenVariables.begin(), seenVariables.end()); + element = std::make_unique(std::move(elements), *anchorIdx); + return success(); +} + +LogicalResult FormatParser::parseOptionalChildElement( + std::vector> &childElements, + SmallPtrSetImpl &seenVariables, + Optional &anchorIdx) { + llvm::SMLoc childLoc = curToken.getLoc(); + childElements.push_back({}); + if (failed(parseElement(childElements.back(), /*isTopLevel=*/true))) + return failure(); + + // Check to see if this element is the anchor of the optional group. + bool isAnchor = curToken.getKind() == Token::caret; + if (isAnchor) { + if (anchorIdx) + return emitError(childLoc, "only one element can be marked as the anchor " + "of an optional group"); + anchorIdx = childElements.size() - 1; + consumeToken(); + } + + return TypeSwitch(childElements.back().get()) + // All attributes can be within the optional group, but only optional + // attributes can be the anchor. + .Case([&](AttributeVariable *attrEle) { + if (isAnchor && !attrEle->getVar()->attr.isOptional()) + return emitError(childLoc, "only optional attributes can be used to " + "anchor an optional group"); + return success(); + }) + // Only optional-like(i.e. variadic) operands can be within an optional + // group. + .Case([&](OperandVariable *ele) { + if (!ele->getVar()->isVariadic()) + return emitError(childLoc, "only variadic operands can be used within" + " an optional group"); + seenVariables.insert(ele->getVar()); + return success(); + }) + // Literals and type directives may be used, but they can't anchor the + // group. + .Case( + [&](Element *) { + if (isAnchor) + return emitError(childLoc, "only variables can be used to anchor " + "an optional group"); + return success(); + }) + .Default([&](Element *) { + return emitError(childLoc, "only literals, types, and variables can be " + "used within an optional group"); + }); +} + LogicalResult FormatParser::parseAttrDictDirective(std::unique_ptr &element, - llvm::SMLoc loc, bool isTopLevel) { + llvm::SMLoc loc, bool isTopLevel, + bool withKeyword) { if (!isTopLevel) return emitError(loc, "'attr-dict' directive can only be used as a " "top-level directive"); @@ -1263,7 +1717,7 @@ return emitError(loc, "'attr-dict' directive has already been seen"); hasAttrDict = true; - element = std::make_unique(); + element = std::make_unique(withKeyword); return success(); } @@ -1283,8 +1737,6 @@ failed(parseTypeDirectiveOperand(results)) || failed(parseToken(Token::r_paren, "expected ')' after argument list"))) return failure(); - - // Get the proper directive kind and create it. 
  element = std::make_unique<FunctionalTypeDirective>(std::move(inputs),
                                                      std::move(results));
  return success();
@@ -1310,6 +1762,19 @@
   return success();
 }

+LogicalResult
+FormatParser::parseSuccessorsDirective(std::unique_ptr<Element> &element,
+                                       llvm::SMLoc loc, bool isTopLevel) {
+  if (!isTopLevel)
+    return emitError(loc,
+                     "'successors' is only valid as a top-level directive");
+  if (hasAllSuccessors || !seenSuccessors.empty())
+    return emitError(loc, "'successors' directive creates overlap in format");
+  hasAllSuccessors = true;
+  element = std::make_unique<SuccessorsDirective>();
+  return success();
+}
+
 LogicalResult FormatParser::parseTypeDirective(std::unique_ptr<Element> &element,
                                                Token tok, bool isTopLevel) {
diff --git a/mlir/tools/mlir-vulkan-runner/VulkanRuntime.cpp b/mlir/tools/mlir-vulkan-runner/VulkanRuntime.cpp
--- a/mlir/tools/mlir-vulkan-runner/VulkanRuntime.cpp
+++ b/mlir/tools/mlir-vulkan-runner/VulkanRuntime.cpp
@@ -170,9 +170,8 @@
   vkGetDeviceQueue(device, queueFamilyIndex, 0, &queue);

   // Submit command buffer into the queue.
-  if (failed(submitCommandBuffersToQueue())) {
+  if (failed(submitCommandBuffersToQueue()))
     return failure();
-  }

   RETURN_ON_VULKAN_ERROR(vkQueueWaitIdle(queue), "vkQueueWaitIdle");
   return success();
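For reference, an op that declares `let successors = (successor AnySuccessor:$dest);` ends up with accessors along these lines, an illustrative sketch instantiated by hand from the formatv templates in genNamedSuccessorGetters above (`dest` is a hypothetical successor name; the generated text may differ in detail):

    // Sketch of the C++ accessors tblgen would emit for successor #0 'dest'.
    Block *dest() { return this->getOperation()->getSuccessor(0); }
    Operation::operand_range destOperands() {
      return this->getOperation()->getSuccessorOperands(0);
    }
    Value destOperand(unsigned index) {
      return this->getOperation()->getSuccessorOperand(0, index);
    }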