diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp --- a/clang-tools-extra/clangd/refactor/Rename.cpp +++ b/clang-tools-extra/clangd/refactor/Rename.cpp @@ -215,7 +215,7 @@ // getUSRsForDeclaration will find other related symbols, e.g. virtual and its // overriddens, primary template and all explicit specializations. // FIXME: Get rid of the remaining tooling APIs. - const auto RenameDecl = + const auto *RenameDecl = ND.getDescribedTemplate() ? ND.getDescribedTemplate() : &ND; std::vector RenameUSRs = tooling::getUSRsForDeclaration(RenameDecl, AST.getASTContext()); diff --git a/clang-tools-extra/clangd/unittests/RenameTests.cpp b/clang-tools-extra/clangd/unittests/RenameTests.cpp --- a/clang-tools-extra/clangd/unittests/RenameTests.cpp +++ b/clang-tools-extra/clangd/unittests/RenameTests.cpp @@ -722,7 +722,7 @@ void onDiagnosticsReady(PathRef File, std::vector Diagnostics) override {} } DiagConsumer; - // rename is runnning on the "^" point in FooH, and "[[]]" ranges are the + // rename is runnning on all "^" points in FooH, and "[[]]" ranges are the // expected rename occurrences. struct Case { llvm::StringRef FooH; @@ -763,28 +763,10 @@ )cpp", }, { - // Constructor. + // rename on constructor and destructor. R"cpp( class [[Foo]] { [[^Foo]](); - ~[[Foo]](); - }; - )cpp", - R"cpp( - #include "foo.h" - [[Foo]]::[[Foo]]() {} - [[Foo]]::~[[Foo]]() {} - - void func() { - [[Foo]] foo; - } - )cpp", - }, - { - // Destructor (selecting before the identifier). 
- R"cpp( - class [[Foo]] { - [[Foo]](); ~[[Foo^]](); }; )cpp", @@ -891,12 +873,15 @@ runAddDocument(Server, FooCCPath, FooCC.code()); llvm::StringRef NewName = "NewName"; - auto FileEditsList = - llvm::cantFail(runRename(Server, FooHPath, FooH.point(), NewName)); - EXPECT_THAT(applyEdits(std::move(FileEditsList)), - UnorderedElementsAre( - Pair(Eq(FooHPath), Eq(expectedResult(T.FooH, NewName))), - Pair(Eq(FooCCPath), Eq(expectedResult(T.FooCC, NewName))))); + for (const auto &RenamePos : FooH.points()) { + auto FileEditsList = + llvm::cantFail(runRename(Server, FooHPath, RenamePos, NewName)); + EXPECT_THAT( + applyEdits(std::move(FileEditsList)), + UnorderedElementsAre( + Pair(Eq(FooHPath), Eq(expectedResult(T.FooH, NewName))), + Pair(Eq(FooCCPath), Eq(expectedResult(T.FooCC, NewName))))); + } } } diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -307,10 +307,12 @@ (IRIntBase<"maxnum", [Vector]> $a, $b)>; def vminnmaq: Intrinsic - $a, (IRIntBase<"fabs", [Vector]> $b))>; + (IRIntBase<"fabs", [Vector]> $a), + (IRIntBase<"fabs", [Vector]> $b))>; def vmaxnmaq: Intrinsic - $a, (IRIntBase<"fabs", [Vector]> $b))>; + (IRIntBase<"fabs", [Vector]> $a), + (IRIntBase<"fabs", [Vector]> $b))>; } def vpselq: IntrinsicTok.setKind(tok::starequal); return; } - if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) + if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) { + // Treat like the "||" operator (as opposed to the ternary ?). + Tokens.back()->Tok.setKind(tok::pipepipe); return; + } if (tryMergeTokens(JSNullPropagatingOperator, TT_JsNullPropagatingOperator)) { // Treat like a regular "." access. 
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -16280,8 +16280,10 @@ captureVariablyModifiedType(Context, QTy, OuterRSI); } } - bool IsTargetCap = !IsOpenMPPrivateDecl && - isOpenMPTargetCapturedDecl(Var, RSI->OpenMPLevel); + bool IsTargetCap = + !IsOpenMPPrivateDecl && + isOpenMPTargetCapturedDecl(Var, RSI->OpenMPLevel, + RSI->OpenMPCaptureLevel); // When we detect target captures we are looking from inside the // target region, therefore we need to propagate the capture from the // enclosing region. Therefore, the capture is not initially nested. diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -31,6 +31,7 @@ #include "clang/Sema/SemaInternal.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/PointerEmbeddedInt.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" using namespace clang; using namespace llvm::omp; @@ -2010,7 +2011,23 @@ // if (OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) return nullptr; - return VD; + CapturedRegionScopeInfo *CSI = nullptr; + for (FunctionScopeInfo *FSI : llvm::drop_begin( + llvm::reverse(FunctionScopes), + CheckScopeInfo ? 
(FunctionScopes.size() - (StopAt + 1)) : 0)) { + if (!isa(FSI)) + return nullptr; + if (auto *RSI = dyn_cast(FSI)) + if (RSI->CapRegionKind == CR_OpenMP) { + CSI = RSI; + break; + } + } + SmallVector Regions; + getOpenMPCaptureRegions(Regions, + DSAStack->getDirective(CSI->OpenMPLevel)); + if (Regions[CSI->OpenMPCaptureLevel] != OMPD_task) + return VD; } } @@ -2151,15 +2168,18 @@ FD->addAttr(OMPCaptureKindAttr::CreateImplicit(Context, OMPC)); } -bool Sema::isOpenMPTargetCapturedDecl(const ValueDecl *D, - unsigned Level) const { +bool Sema::isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level, + unsigned CaptureLevel) const { assert(LangOpts.OpenMP && "OpenMP is not allowed"); // Return true if the current level is no longer enclosed in a target region. + SmallVector Regions; + getOpenMPCaptureRegions(Regions, DSAStack->getDirective(Level)); const auto *VD = dyn_cast(D); return VD && !VD->hasLocalStorage() && DSAStack->hasExplicitDirective(isOpenMPTargetExecutionDirective, - Level); + Level) && + Regions[CaptureLevel] != OMPD_task; } void Sema::DestroyDataSharingAttributesStack() { delete DSAStack; } diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c @@ -6,9 +6,10 @@ // CHECK-LABEL: @test_vmaxnmaq_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[B:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.maxnum.v8f16(<8 x half> [[A:%.*]], <8 x half> [[TMP0]]) -// CHECK-NEXT: ret <8 x half> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[B:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.maxnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]]) +// CHECK-NEXT: ret <8 x half> 
[[TMP2]] // float16x8_t test_vmaxnmaq_f16(float16x8_t a, float16x8_t b) { @@ -21,9 +22,10 @@ // CHECK-LABEL: @test_vmaxnmaq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[B:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[A:%.*]], <4 x float> [[TMP0]]) -// CHECK-NEXT: ret <4 x float> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[B:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] // float32x4_t test_vmaxnmaq_f32(float32x4_t a, float32x4_t b) { diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c @@ -6,9 +6,10 @@ // CHECK-LABEL: @test_vminnmaq_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[B:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.minnum.v8f16(<8 x half> [[A:%.*]], <8 x half> [[TMP0]]) -// CHECK-NEXT: ret <8 x half> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> [[B:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.minnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]]) +// CHECK-NEXT: ret <8 x half> [[TMP2]] // float16x8_t test_vminnmaq_f16(float16x8_t a, float16x8_t b) { @@ -21,9 +22,10 @@ // CHECK-LABEL: @test_vminnmaq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[B:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.minnum.v4f32(<4 x 
float> [[A:%.*]], <4 x float> [[TMP0]]) -// CHECK-NEXT: ret <4 x float> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[B:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] // float32x4_t test_vminnmaq_f32(float32x4_t a, float32x4_t b) { diff --git a/clang/test/OpenMP/target_depend_codegen.cpp b/clang/test/OpenMP/target_depend_codegen.cpp --- a/clang/test/OpenMP/target_depend_codegen.cpp +++ b/clang/test/OpenMP/target_depend_codegen.cpp @@ -209,7 +209,7 @@ // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] // CHECK: [[FAIL]] // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], @@ -223,7 +223,7 @@ // CHECK: call void (i8*, ...) 
% // CHECK: [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2 // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], diff --git a/clang/test/OpenMP/target_messages.cpp b/clang/test/OpenMP/target_messages.cpp --- a/clang/test/OpenMP/target_messages.cpp +++ b/clang/test/OpenMP/target_messages.cpp @@ -112,4 +112,12 @@ return 0; } + +template struct a { static bool b; }; +template ::b> void e(c) { // expected-note {{candidate template ignored: substitution failure [with c = int]: non-type template argument is not a constant expression}} +#pragma omp target + { + int d ; e(d); // expected-error {{no matching function for call to 'e'}} + } +} #endif diff --git a/clang/test/OpenMP/target_parallel_depend_codegen.cpp b/clang/test/OpenMP/target_parallel_depend_codegen.cpp --- a/clang/test/OpenMP/target_parallel_depend_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_depend_codegen.cpp @@ -209,7 +209,7 @@ // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] // CHECK: [[FAIL]] // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], @@ -223,7 +223,7 @@ // CHECK: call void (i8*, ...) 
% // CHECK: [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2 // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], diff --git a/clang/test/OpenMP/target_parallel_for_depend_codegen.cpp b/clang/test/OpenMP/target_parallel_for_depend_codegen.cpp --- a/clang/test/OpenMP/target_parallel_for_depend_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_depend_codegen.cpp @@ -209,7 +209,7 @@ // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] // CHECK: [[FAIL]] // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], @@ -223,7 +223,7 @@ // CHECK: call void (i8*, ...) 
% // CHECK: [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2 // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], diff --git a/clang/test/OpenMP/target_parallel_for_simd_depend_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_depend_codegen.cpp --- a/clang/test/OpenMP/target_parallel_for_simd_depend_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_simd_depend_codegen.cpp @@ -209,7 +209,7 @@ // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] // CHECK: [[FAIL]] // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], @@ -223,7 +223,7 @@ // CHECK: call void (i8*, ...) 
% // CHECK: [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2 // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], diff --git a/clang/test/OpenMP/target_simd_depend_codegen.cpp b/clang/test/OpenMP/target_simd_depend_codegen.cpp --- a/clang/test/OpenMP/target_simd_depend_codegen.cpp +++ b/clang/test/OpenMP/target_simd_depend_codegen.cpp @@ -209,7 +209,7 @@ // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] // CHECK: [[FAIL]] // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], @@ -223,7 +223,7 @@ // CHECK: call void (i8*, ...) 
% // CHECK: [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2 // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], diff --git a/clang/test/OpenMP/target_teams_depend_codegen.cpp b/clang/test/OpenMP/target_teams_depend_codegen.cpp --- a/clang/test/OpenMP/target_teams_depend_codegen.cpp +++ b/clang/test/OpenMP/target_teams_depend_codegen.cpp @@ -209,7 +209,7 @@ // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] // CHECK: [[FAIL]] // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], @@ -223,7 +223,7 @@ // CHECK: call void (i8*, ...) 
% // CHECK: [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2 // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], diff --git a/clang/test/OpenMP/target_teams_distribute_depend_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_depend_codegen.cpp --- a/clang/test/OpenMP/target_teams_distribute_depend_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_depend_codegen.cpp @@ -209,7 +209,7 @@ // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] // CHECK: [[FAIL]] // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], @@ -223,7 +223,7 @@ // CHECK: call void (i8*, ...) 
% // CHECK: [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2 // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_depend_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_depend_codegen.cpp --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_depend_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_depend_codegen.cpp @@ -209,7 +209,7 @@ // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] // CHECK: [[FAIL]] // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], @@ -223,7 +223,7 @@ // CHECK: call void (i8*, ...) 
% // CHECK: [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2 // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_depend_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_depend_codegen.cpp --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_depend_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_depend_codegen.cpp @@ -209,7 +209,7 @@ // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] // CHECK: [[FAIL]] // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], @@ -223,7 +223,7 @@ // CHECK: call void (i8*, ...) 
% // CHECK: [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2 // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], diff --git a/clang/test/OpenMP/target_teams_distribute_simd_depend_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_depend_codegen.cpp --- a/clang/test/OpenMP/target_teams_distribute_simd_depend_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_depend_codegen.cpp @@ -209,7 +209,7 @@ // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] // CHECK: [[FAIL]] // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], @@ -223,7 +223,7 @@ // CHECK: call void (i8*, ...) % // CHECK: [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2 // CHECK: [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** % -// CHECK: [[BP1_I32:%.+]] = load i32, i32* % +// CHECK: [[BP1_I32:%.+]] = load i32, i32* @ // CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32* // CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]], // CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]], diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp --- a/clang/unittests/Format/FormatTestJS.cpp +++ b/clang/unittests/Format/FormatTestJS.cpp @@ -2294,6 +2294,11 @@ TEST_F(FormatTestJS, NullishCoalescingOperator) { verifyFormat("const val = something ?? 
'some other default';\n"); + verifyFormat( + "const val = something ?? otherDefault ??\n" + " evenMore ?? evenMore;\n", + "const val = something ?? otherDefault ?? evenMore ?? evenMore;\n", + getGoogleJSStyleWithColumns(40)); } TEST_F(FormatTestJS, Conditional) { diff --git a/debuginfo-tests/llvm-prettyprinters/gdb/prettyprinters.cpp b/debuginfo-tests/llvm-prettyprinters/gdb/prettyprinters.cpp --- a/debuginfo-tests/llvm-prettyprinters/gdb/prettyprinters.cpp +++ b/debuginfo-tests/llvm-prettyprinters/gdb/prettyprinters.cpp @@ -1,12 +1,15 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/Error.h" int Array[] = {1, 2, 3}; +auto IntPtr = reinterpret_cast(0xabc); llvm::ArrayRef ArrayRef(Array); llvm::MutableArrayRef MutableArrayRef(Array); @@ -19,7 +22,7 @@ llvm::SmallString<5> SmallString("foo"); llvm::StringRef StringRef = "bar"; llvm::Twine Twine = llvm::Twine(SmallString) + StringRef; +llvm::PointerIntPair PointerIntPair(IntPtr, 1); +llvm::PointerUnion PointerUnion(IntPtr); -int main() { - return 0; -} +int main() { return 0; } diff --git a/debuginfo-tests/llvm-prettyprinters/gdb/prettyprinters.gdb b/debuginfo-tests/llvm-prettyprinters/gdb/prettyprinters.gdb --- a/debuginfo-tests/llvm-prettyprinters/gdb/prettyprinters.gdb +++ b/debuginfo-tests/llvm-prettyprinters/gdb/prettyprinters.gdb @@ -39,3 +39,10 @@ # CHECK: "\"foo\"\"bar\"" p Twine +# CHECK: llvm::PointerIntPair = {pointer = 0xabc, value = 1} +p PointerIntPair + +# CHECK: llvm::PointerUnion containing int * = {pointer = 0xabc} +p PointerUnion + + diff --git a/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash-invalid-iterator/Makefile b/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash-invalid-iterator/Makefile new file mode 100644 
--- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash-invalid-iterator/Makefile @@ -0,0 +1,2 @@ +CXX_SOURCES := main.cpp +include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash-invalid-iterator/TestInvalidIteratorCompletionCrash.py b/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash-invalid-iterator/TestInvalidIteratorCompletionCrash.py new file mode 100644 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash-invalid-iterator/TestInvalidIteratorCompletionCrash.py @@ -0,0 +1,21 @@ +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestCase(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + @skipIf # rdar://problem/53931074 + def test(self): + self.build() + exe = self.getBuildArtifact("a.out") + target = self.dbg.CreateTarget(exe) + callee_break = target.BreakpointCreateByName( + "SomeClass::SomeClass(ParamClass)", None) + self.assertTrue(callee_break.GetNumLocations() > 0) + self.runCmd("run", RUN_SUCCEEDED) + + to_complete = "e ParamClass" + self.dbg.GetCommandInterpreter().HandleCompletion(to_complete, len(to_complete), 0, -1, lldb.SBStringList()) diff --git a/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash-invalid-iterator/main.cpp b/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash-invalid-iterator/main.cpp new file mode 100644 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash-invalid-iterator/main.cpp @@ -0,0 +1,22 @@ +class LoadedByParamClass {}; +struct ParamClass { + LoadedByParamClass some_func(); +}; +struct SomeClass { + // LLDB stops in the constructor and then requests + // possible expression completions. This will iterate over the + // declarations in the translation unit. 
+ // The unnamed ParamClass parameter causes that LLDB will add + // an incomplete ParamClass decl to the translation unit which + // the code completion will find. Upon inspecting the ParamClass + // decl to see if it can be used to provide any useful completions, + // Clang will complete it and load all its members. + // This causes that its member function some_func is loaded which in turn + // loads the LoadedByParamClass decl. When LoadedByParamClass + // is created it will be added to the translation unit which + // will invalidate all iterators that currently iterate over + // the translation unit. The iterator we use for code completion + // is now invalidated and LLDB crashes. + SomeClass(ParamClass) {} +}; +int main() { ParamClass e; SomeClass y(e); } diff --git a/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash1/TestCompletionCrash1.py b/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash1/TestCompletionCrash1.py deleted file mode 100644 --- a/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash1/TestCompletionCrash1.py +++ /dev/null @@ -1,4 +0,0 @@ -from lldbsuite.test import lldbinline -from lldbsuite.test import decorators - -lldbinline.MakeInlineTest(__file__, globals(), [decorators.skipIf(bugnumber="rdar://53659341")]) diff --git a/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash1/main.cpp b/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash1/main.cpp deleted file mode 100644 --- a/lldb/packages/Python/lldbsuite/test/commands/expression/completion-crash1/main.cpp +++ /dev/null @@ -1,12 +0,0 @@ -namespace std { -struct a { - a() {} - a(a &&); -}; -template struct au { - a ay; - ~au() { //%self.dbg.GetCommandInterpreter().HandleCompletion("e ", len("e "), 0, -1, lldb.SBStringList()) - } -}; -} -int main() { std::au{}; } diff --git 
a/lldb/packages/Python/lldbsuite/test/commands/expression/deleting-implicit-copy-constructor/TestDeletingImplicitCopyConstructor.py b/lldb/packages/Python/lldbsuite/test/commands/expression/deleting-implicit-copy-constructor/TestDeletingImplicitCopyConstructor.py new file mode 100644 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/expression/deleting-implicit-copy-constructor/TestDeletingImplicitCopyConstructor.py @@ -0,0 +1,4 @@ +from lldbsuite.test import lldbinline +from lldbsuite.test import decorators + +lldbinline.MakeInlineTest(__file__, globals()) diff --git a/lldb/packages/Python/lldbsuite/test/commands/expression/deleting-implicit-copy-constructor/main.cpp b/lldb/packages/Python/lldbsuite/test/commands/expression/deleting-implicit-copy-constructor/main.cpp new file mode 100644 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/expression/deleting-implicit-copy-constructor/main.cpp @@ -0,0 +1,20 @@ +struct NoCopyCstr { + NoCopyCstr() {} + // No copy constructor but a move constructor means we have an + // implicitly deleted copy constructor (C++11 [class.copy]p7, p18). + NoCopyCstr(NoCopyCstr &&); +}; +struct IndirectlyDeletedCopyCstr { + // This field indirectly deletes the implicit copy constructor. + NoCopyCstr field; + // Completing in the constructor or constructing the class + // will cause Sema to declare the special members of IndirectlyDeletedCopyCstr. + // If we correctly set the deleted implicit copy constructor in NoCopyCstr then this + // should have propagated to this record and Clang won't crash. 
+ IndirectlyDeletedCopyCstr() { //%self.expect_expr("IndirectlyDeletedCopyCstr x; 1+1", result_type="int", result_value="2") + //%self.dbg.GetCommandInterpreter().HandleCompletion("e ", len("e "), 0, -1, lldb.SBStringList()) + } +}; +int main() { + IndirectlyDeletedCopyCstr{}; +} diff --git a/lldb/source/Symbol/ClangASTContext.cpp b/lldb/source/Symbol/ClangASTContext.cpp --- a/lldb/source/Symbol/ClangASTContext.cpp +++ b/lldb/source/Symbol/ClangASTContext.cpp @@ -7783,6 +7783,19 @@ clang::TagDecl *tag_decl = tag_type->getDecl(); if (auto *cxx_record_decl = llvm::dyn_cast(tag_decl)) { + // If we have a move constructor declared but no copy constructor we + // need to explicitly mark it as deleted. Usually Sema would do this for + // us in Sema::DeclareImplicitCopyConstructor but we don't have a Sema + // when building an AST from debug information. + // See also: + // C++11 [class.copy]p7, p18: + // If the class definition declares a move constructor or move assignment + // operator, an implicitly declared copy constructor or copy assignment + // operator is defined as deleted. 
+ if (cxx_record_decl->hasUserDeclaredMoveConstructor() && + cxx_record_decl->needsImplicitCopyConstructor()) + cxx_record_decl->setImplicitCopyConstructorIsDeleted(); + if (!cxx_record_decl->isCompleteDefinition()) cxx_record_decl->completeDefinition(); cxx_record_decl->setHasLoadedFieldsFromExternalStorage(true); diff --git a/lldb/unittests/Symbol/TestClangASTContext.cpp b/lldb/unittests/Symbol/TestClangASTContext.cpp --- a/lldb/unittests/Symbol/TestClangASTContext.cpp +++ b/lldb/unittests/Symbol/TestClangASTContext.cpp @@ -523,3 +523,89 @@ EXPECT_EQ("foo", func_template->getName()); EXPECT_EQ(clang::AccessSpecifier::AS_public, func_template->getAccess()); } + +TEST_F(TestClangASTContext, TestDeletingImplicitCopyCstrDueToMoveCStr) { + // We need to simulate this behavior in our AST that we construct as we don't + // have a Sema instance that can do this for us: + // C++11 [class.copy]p7, p18: + // If the class definition declares a move constructor or move assignment + // operator, an implicitly declared copy constructor or copy assignment + // operator is defined as deleted. + + // Create a record and start defining it. + llvm::StringRef class_name = "S"; + CompilerType t = clang_utils::createRecord(*m_ast, class_name); + m_ast->StartTagDeclarationDefinition(t); + + // Create a move constructor that will delete the implicit copy constructor. 
+ CompilerType return_type = m_ast->GetBasicType(lldb::eBasicTypeVoid); + CompilerType param_type = t.GetRValueReferenceType(); + CompilerType function_type = + m_ast->CreateFunctionType(return_type, ¶m_type, /*num_params*/ 1, + /*variadic=*/false, /*quals*/ 0U); + bool is_virtual = false; + bool is_static = false; + bool is_inline = false; + bool is_explicit = true; + bool is_attr_used = false; + bool is_artificial = false; + m_ast->AddMethodToCXXRecordType( + t.GetOpaqueQualType(), class_name, nullptr, function_type, + lldb::AccessType::eAccessPublic, is_virtual, is_static, is_inline, + is_explicit, is_attr_used, is_artificial); + + // Complete the definition and check the created record. + m_ast->CompleteTagDeclarationDefinition(t); + auto *record = llvm::cast(ClangUtil::GetAsTagDecl(t)); + // We can't call defaultedCopyConstructorIsDeleted() as this requires that + // the Decl passes through Sema which will actually compute this field. + // Instead we check that there is no copy constructor declared by the user + // which only leaves a non-deleted defaulted copy constructor as an option + // that our record will have no simple copy constructor. + EXPECT_FALSE(record->hasUserDeclaredCopyConstructor()); + EXPECT_FALSE(record->hasSimpleCopyConstructor()); +} + +TEST_F(TestClangASTContext, TestNotDeletingUserCopyCstrDueToMoveCStr) { + // Tests that we don't delete the a user-defined copy constructor when + // a move constructor is provided. + // See also the TestDeletingImplicitCopyCstrDueToMoveCStr test. + llvm::StringRef class_name = "S"; + CompilerType t = clang_utils::createRecord(*m_ast, class_name); + m_ast->StartTagDeclarationDefinition(t); + + CompilerType return_type = m_ast->GetBasicType(lldb::eBasicTypeVoid); + bool is_virtual = false; + bool is_static = false; + bool is_inline = false; + bool is_explicit = true; + bool is_attr_used = false; + bool is_artificial = false; + // Create a move constructor. 
+ { + CompilerType param_type = t.GetRValueReferenceType(); + CompilerType function_type = + m_ast->CreateFunctionType(return_type, &param_type, /*num_params*/ 1, + /*variadic=*/false, /*quals*/ 0U); + m_ast->AddMethodToCXXRecordType( + t.GetOpaqueQualType(), class_name, nullptr, function_type, + lldb::AccessType::eAccessPublic, is_virtual, is_static, is_inline, + is_explicit, is_attr_used, is_artificial); + } + // Create a copy constructor. + { + CompilerType param_type = t.GetLValueReferenceType().AddConstModifier(); + CompilerType function_type = + m_ast->CreateFunctionType(return_type, &param_type, /*num_params*/ 1, + /*variadic=*/false, /*quals*/ 0U); + m_ast->AddMethodToCXXRecordType( + t.GetOpaqueQualType(), class_name, nullptr, function_type, + lldb::AccessType::eAccessPublic, is_virtual, is_static, is_inline, + is_explicit, is_attr_used, is_artificial); + } + + // Complete the definition and check the created record. + m_ast->CompleteTagDeclarationDefinition(t); + auto *record = llvm::cast(ClangUtil::GetAsTagDecl(t)); + EXPECT_TRUE(record->hasUserDeclaredCopyConstructor()); +} diff --git a/llvm/include/llvm/ADT/PointerIntPair.h b/llvm/include/llvm/ADT/PointerIntPair.h --- a/llvm/include/llvm/ADT/PointerIntPair.h +++ b/llvm/include/llvm/ADT/PointerIntPair.h @@ -147,7 +147,7 @@ "cannot use a pointer type that has all bits free"); static_assert(IntBits <= PtrTraits::NumLowBitsAvailable, "PointerIntPair with integer size too large for pointer"); - enum : uintptr_t { + enum MaskAndShiftConstants : uintptr_t { /// PointerBitMask - The bits that come from the pointer. PointerBitMask = ~(uintptr_t)(((intptr_t)1 << PtrTraits::NumLowBitsAvailable) - 1), diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -569,20 +569,40 @@ /// specified, it will be added to the instruction. Likewise with alias.scope /// and noalias tags. 
CallInst *CreateElementUnorderedAtomicMemCpy( - Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, - uint64_t Size, uint32_t ElementSize, MDNode *TBAATag = nullptr, + Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size, + uint32_t ElementSize, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr) { + MDNode *NoAliasTag = nullptr); + + /// FIXME: Remove this function once transition to Align is over. + /// Use the version that takes Align instead of this one. + LLVM_ATTRIBUTE_DEPRECATED(CallInst *CreateElementUnorderedAtomicMemCpy( + Value *Dst, unsigned DstAlign, Value *Src, + unsigned SrcAlign, uint64_t Size, + uint32_t ElementSize, MDNode *TBAATag = nullptr, + MDNode *TBAAStructTag = nullptr, + MDNode *ScopeTag = nullptr, + MDNode *NoAliasTag = nullptr), + "Use the version that takes Align instead") { return CreateElementUnorderedAtomicMemCpy( - Dst, DstAlign, Src, SrcAlign, getInt64(Size), ElementSize, TBAATag, - TBAAStructTag, ScopeTag, NoAliasTag); + Dst, Align(DstAlign), Src, Align(SrcAlign), getInt64(Size), ElementSize, + TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); } - CallInst *CreateElementUnorderedAtomicMemCpy( - Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, Value *Size, - uint32_t ElementSize, MDNode *TBAATag = nullptr, - MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr); + /// FIXME: Remove this function once transition to Align is over. + /// Use the version that takes Align instead of this one. 
+ LLVM_ATTRIBUTE_DEPRECATED(CallInst *CreateElementUnorderedAtomicMemCpy( + Value *Dst, unsigned DstAlign, Value *Src, + unsigned SrcAlign, Value *Size, + uint32_t ElementSize, MDNode *TBAATag = nullptr, + MDNode *TBAAStructTag = nullptr, + MDNode *ScopeTag = nullptr, + MDNode *NoAliasTag = nullptr), + "Use the version that takes Align instead") { + return CreateElementUnorderedAtomicMemCpy( + Dst, Align(DstAlign), Src, Align(SrcAlign), Size, ElementSize, TBAATag, + TBAAStructTag, ScopeTag, NoAliasTag); + } /// Create and insert a memmove between the specified /// pointers. diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4014,6 +4014,34 @@ if (isa(FalseVal)) // select ?, X, undef -> X return TrueVal; + // Deal with partial undef vector constants: select ?, VecC, VecC' --> VecC'' + Constant *TrueC, *FalseC; + if (TrueVal->getType()->isVectorTy() && match(TrueVal, m_Constant(TrueC)) && + match(FalseVal, m_Constant(FalseC))) { + unsigned NumElts = TrueC->getType()->getVectorNumElements(); + SmallVector NewC; + for (unsigned i = 0; i != NumElts; ++i) { + // Bail out on incomplete vector constants. + Constant *TEltC = TrueC->getAggregateElement(i); + Constant *FEltC = FalseC->getAggregateElement(i); + if (!TEltC || !FEltC) + break; + + // If the elements match (undef or not), that value is the result. If only + // one element is undef, choose the defined element as the safe result. 
+ if (TEltC == FEltC) + NewC.push_back(TEltC); + else if (isa(TEltC)) + NewC.push_back(FEltC); + else if (isa(FEltC)) + NewC.push_back(TEltC); + else + break; + } + if (NewC.size() == NumElts) + return ConstantVector::get(NewC); + } + if (Value *V = simplifySelectWithICmpCond(Cond, TrueVal, FalseVal, Q, MaxRecurse)) return V; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -9222,9 +9222,11 @@ !isAvailableAtLoopEntry(SplitRHS.first, MDL)) return false; - return isLoopEntryGuardedByCond(MDL, Pred, SplitLHS.first, SplitRHS.first) && - isLoopBackedgeGuardedByCond(MDL, Pred, SplitLHS.second, - SplitRHS.second); + // It seems backedge guard check is faster than entry one so in some cases + // it can speed up whole estimation by short circuit + return isLoopBackedgeGuardedByCond(MDL, Pred, SplitLHS.second, + SplitRHS.second) && + isLoopEntryGuardedByCond(MDL, Pred, SplitLHS.first, SplitRHS.first); } bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred, diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -201,7 +201,7 @@ } CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy( - Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, Value *Size, + Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size, uint32_t ElementSize, MDNode *TBAATag, MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) { assert(DstAlign >= ElementSize && diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -2312,16 +2312,16 @@ multiclass MVE_VMINA : MVE_VMINMAXA_m<"vmina", VTI, umin, int_arm_mve_vmina_predicated, 0b1>; -defm MVE_VMAXAs8 : MVE_VMINA; -defm MVE_VMAXAs16 : MVE_VMINA; -defm MVE_VMAXAs32 : MVE_VMINA; +defm 
MVE_VMINAs8 : MVE_VMINA; +defm MVE_VMINAs16 : MVE_VMINA; +defm MVE_VMINAs32 : MVE_VMINA; multiclass MVE_VMAXA : MVE_VMINMAXA_m<"vmaxa", VTI, umax, int_arm_mve_vmaxa_predicated, 0b0>; -defm MVE_VMINAs8 : MVE_VMAXA; -defm MVE_VMINAs16 : MVE_VMAXA; -defm MVE_VMINAs32 : MVE_VMAXA; +defm MVE_VMAXAs8 : MVE_VMAXA; +defm MVE_VMAXAs16 : MVE_VMAXA; +defm MVE_VMAXAs32 : MVE_VMAXA; // end of MVE Integer instructions @@ -3655,7 +3655,8 @@ let Predicates = [HasMVEInt] in { // Unpredicated v(max|min)nma - def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qd), (fabs (VTI.Vec MQPR:$Qm)))), + def : Pat<(VTI.Vec (unpred_op (fabs (VTI.Vec MQPR:$Qd)), + (fabs (VTI.Vec MQPR:$Qm)))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm)))>; // Predicated v(max|min)nma diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h @@ -29,6 +29,8 @@ class HexagonFrameLowering : public TargetFrameLowering { public: + // First register which could possibly hold a variable argument. + int FirstVarArgSavedReg; explicit HexagonFrameLowering() : TargetFrameLowering(StackGrowsDown, Align(8), 0, Align::None(), true) {} diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -395,6 +395,9 @@ MachineBasicBlock *&PrologB, MachineBasicBlock *&EpilogB) const { static unsigned ShrinkCounter = 0; + if (MF.getSubtarget().isEnvironmentMusl() && + MF.getFunction().isVarArg()) + return; if (ShrinkLimit.getPosition()) { if (ShrinkCounter >= ShrinkLimit) return; @@ -622,6 +625,118 @@ DebugLoc dl = MBB.findDebugLoc(InsertPt); + if (MF.getFunction().isVarArg() && + MF.getSubtarget().isEnvironmentMusl()) { + // Calculate the size of register saved area. 
+ int NumVarArgRegs = 6 - FirstVarArgSavedReg; + int RegisterSavedAreaSizePlusPadding = (NumVarArgRegs % 2 == 0) + ? NumVarArgRegs * 4 + : NumVarArgRegs * 4 + 4; + if (RegisterSavedAreaSizePlusPadding > 0) { + // Decrement the stack pointer by size of register saved area plus + // padding if any. + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP) + .addReg(SP) + .addImm(-RegisterSavedAreaSizePlusPadding) + .setMIFlag(MachineInstr::FrameSetup); + + int NumBytes = 0; + // Copy all the named arguments below register saved area. + auto &HMFI = *MF.getInfo(); + for (int i = HMFI.getFirstNamedArgFrameIndex(), + e = HMFI.getLastNamedArgFrameIndex(); i >= e; --i) { + int ObjSize = MFI.getObjectSize(i); + int ObjAlign = MFI.getObjectAlignment(i); + + // Determine the kind of load/store that should be used. + unsigned LDOpc, STOpc; + int OpcodeChecker = ObjAlign; + + // Handle cases where alignment of an object is > its size. + if (ObjSize < ObjAlign) { + if (ObjSize <= 1) + OpcodeChecker = 1; + else if (ObjSize <= 2) + OpcodeChecker = 2; + else if (ObjSize <= 4) + OpcodeChecker = 4; + else if (ObjSize > 4) + OpcodeChecker = 8; + } + + switch (OpcodeChecker) { + case 1: + LDOpc = Hexagon::L2_loadrb_io; + STOpc = Hexagon::S2_storerb_io; + break; + case 2: + LDOpc = Hexagon::L2_loadrh_io; + STOpc = Hexagon::S2_storerh_io; + break; + case 4: + LDOpc = Hexagon::L2_loadri_io; + STOpc = Hexagon::S2_storeri_io; + break; + case 8: + default: + LDOpc = Hexagon::L2_loadrd_io; + STOpc = Hexagon::S2_storerd_io; + break; + } + + unsigned RegUsed = LDOpc == Hexagon::L2_loadrd_io ? Hexagon::D3 + : Hexagon::R6; + int LoadStoreCount = ObjSize / OpcodeChecker; + + if (ObjSize % OpcodeChecker) + ++LoadStoreCount; + + // Get the start location of the load. NumBytes is basically the + // offset from the stack pointer of previous function, which would be + // the caller in this case, as this function has variable argument + // list. 
+ if (NumBytes != 0) + NumBytes = alignTo(NumBytes, ObjAlign); + + int Count = 0; + while (Count < LoadStoreCount) { + // Load the value of the named argument on stack. + BuildMI(MBB, InsertPt, dl, HII.get(LDOpc), RegUsed) + .addReg(SP) + .addImm(RegisterSavedAreaSizePlusPadding + + ObjAlign * Count + NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + + // Store it below the register saved area plus padding. + BuildMI(MBB, InsertPt, dl, HII.get(STOpc)) + .addReg(SP) + .addImm(ObjAlign * Count + NumBytes) + .addReg(RegUsed) + .setMIFlag(MachineInstr::FrameSetup); + + Count++; + } + NumBytes += MFI.getObjectSize(i); + } + + // Make NumBytes 8 byte aligned + NumBytes = alignTo(NumBytes, 8); + + // If the number of registers having variable arguments is odd, + // leave 4 bytes of padding to get to the location where first + // variable argument which was passed through register was copied. + NumBytes = (NumVarArgRegs % 2 == 0) ? NumBytes : NumBytes + 4; + + for (int j = FirstVarArgSavedReg, i = 0; j < 6; ++j, ++i) { + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_storeri_io)) + .addReg(SP) + .addImm(NumBytes + 4 * i) + .addReg(Hexagon::R0 + j) + .setMIFlag(MachineInstr::FrameSetup); + } + } + } + if (hasFP(MF)) { insertAllocframe(MBB, InsertPt, NumBytes); if (AlignStack) { @@ -655,7 +770,16 @@ if (!hasFP(MF)) { MachineFrameInfo &MFI = MF.getFrameInfo(); - if (unsigned NumBytes = MFI.getStackSize()) { + unsigned NumBytes = MFI.getStackSize(); + if (MF.getFunction().isVarArg() && + MF.getSubtarget().isEnvironmentMusl()) { + // On Hexagon Linux, deallocate the stack for the register saved area. + int NumVarArgRegs = 6 - FirstVarArgSavedReg; + int RegisterSavedAreaSizePlusPadding = (NumVarArgRegs % 2 == 0) ? 
+ (NumVarArgRegs * 4) : (NumVarArgRegs * 4 + 4); + NumBytes += RegisterSavedAreaSizePlusPadding; + } + if (NumBytes) { BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP) .addReg(SP) .addImm(NumBytes); @@ -710,24 +834,49 @@ NeedsDeallocframe = false; } - if (!NeedsDeallocframe) - return; - // If the returning instruction is PS_jmpret, replace it with dealloc_return, - // otherwise just add deallocframe. The function could be returning via a - // tail call. - if (RetOpc != Hexagon::PS_jmpret || DisableDeallocRet) { - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe)) + if (!MF.getSubtarget().isEnvironmentMusl() || + !MF.getFunction().isVarArg()) { + if (!NeedsDeallocframe) + return; + // If the returning instruction is PS_jmpret, replace it with + // dealloc_return, otherwise just add deallocframe. The function + // could be returning via a tail call. + if (RetOpc != Hexagon::PS_jmpret || DisableDeallocRet) { + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe)) .addDef(Hexagon::D15) .addReg(Hexagon::R30); - return; - } - unsigned NewOpc = Hexagon::L4_return; - MachineInstr *NewI = BuildMI(MBB, RetI, dl, HII.get(NewOpc)) + return; + } + unsigned NewOpc = Hexagon::L4_return; + MachineInstr *NewI = BuildMI(MBB, RetI, dl, HII.get(NewOpc)) .addDef(Hexagon::D15) .addReg(Hexagon::R30); - // Transfer the function live-out registers. - NewI->copyImplicitOps(MF, *RetI); - MBB.erase(RetI); + // Transfer the function live-out registers. + NewI->copyImplicitOps(MF, *RetI); + MBB.erase(RetI); + } else { + // L2_deallocframe instruction after it. + // Calculate the size of register saved area. + int NumVarArgRegs = 6 - FirstVarArgSavedReg; + int RegisterSavedAreaSizePlusPadding = (NumVarArgRegs % 2 == 0) ? + (NumVarArgRegs * 4) : (NumVarArgRegs * 4 + 4); + + MachineBasicBlock::iterator Term = MBB.getFirstTerminator(); + MachineBasicBlock::iterator I = (Term == MBB.begin()) ? 
MBB.end() + : std::prev(Term); + if (I == MBB.end() || + (I->getOpcode() != Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT && + I->getOpcode() != Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC && + I->getOpcode() != Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4 && + I->getOpcode() != Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC)) + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe)) + .addDef(Hexagon::D15) + .addReg(Hexagon::R30); + if (RegisterSavedAreaSizePlusPadding != 0) + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP) + .addReg(SP) + .addImm(RegisterSavedAreaSizePlusPadding); + } } void HexagonFrameLowering::insertAllocframe(MachineBasicBlock &MBB, @@ -2473,6 +2622,8 @@ /// checks are performed, which may still lead to the inline code. bool HexagonFrameLowering::shouldInlineCSR(const MachineFunction &MF, const CSIVect &CSI) const { + if (MF.getSubtarget().isEnvironmentMusl()) + return true; if (MF.getInfo()->hasEHReturn()) return true; if (!hasFP(MF)) diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -248,6 +248,7 @@ } SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -393,9 +393,12 @@ if (GlobalAddressSDNode *GAN = dyn_cast(Callee)) Callee = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, MVT::i32); + // Linux ABI treats var-arg calls the same way as regular ones. 
+ bool TreatAsVarArg = !Subtarget.isEnvironmentMusl() && IsVarArg; + // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; - HexagonCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext(), + HexagonCCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, *DAG.getContext(), NumParams); if (Subtarget.useHVXOps()) @@ -750,9 +753,13 @@ MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + // Linux ABI treats var-arg calls the same way as regular ones. + bool TreatAsVarArg = !Subtarget.isEnvironmentMusl() && IsVarArg; + // Assign locations to all of the incoming arguments. SmallVector ArgLocs; - HexagonCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext(), + HexagonCCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, + *DAG.getContext(), MF.getFunction().getFunctionType()->getNumParams()); if (Subtarget.useHVXOps()) @@ -766,8 +773,24 @@ // caller's stack is passed only when the struct size is smaller than (and // equal to) 8 bytes. If not, no address will be passed into callee and // callee return the result direclty through R0/R1. 
+ auto NextSingleReg = [] (const TargetRegisterClass &RC, unsigned Reg) { + switch (RC.getID()) { + case Hexagon::IntRegsRegClassID: + return Reg - Hexagon::R0 + 1; + case Hexagon::DoubleRegsRegClassID: + return (Reg - Hexagon::D0 + 1) * 2; + case Hexagon::HvxVRRegClassID: + return Reg - Hexagon::V0 + 1; + case Hexagon::HvxWRRegClassID: + return (Reg - Hexagon::W0 + 1) * 2; + } + llvm_unreachable("Unexpected register class"); + }; + auto &HFL = const_cast(*Subtarget.getFrameLowering()); auto &HMFI = *MF.getInfo(); + HFL.FirstVarArgSavedReg = 0; + HMFI.setFirstNamedArgFrameIndex(-int(MFI.getNumFixedObjects())); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -811,6 +834,7 @@ } InVals.push_back(Copy); MRI.addLiveIn(VA.getLocReg(), VReg); + HFL.FirstVarArgSavedReg = NextSingleReg(*RC, VA.getLocReg()); } else { assert(VA.isMemLoc() && "Argument should be passed in memory"); @@ -838,8 +862,48 @@ } } + if (IsVarArg && Subtarget.isEnvironmentMusl()) { + for (int i = HFL.FirstVarArgSavedReg; i < 6; i++) + MRI.addLiveIn(Hexagon::R0+i); + } + + if (IsVarArg && Subtarget.isEnvironmentMusl()) { + HMFI.setFirstNamedArgFrameIndex(HMFI.getFirstNamedArgFrameIndex() - 1); + HMFI.setLastNamedArgFrameIndex(-int(MFI.getNumFixedObjects())); + + // Create Frame index for the start of register saved area. + int NumVarArgRegs = 6 - HFL.FirstVarArgSavedReg; + bool RequiresPadding = (NumVarArgRegs & 1); + int RegSaveAreaSizePlusPadding = RequiresPadding + ? (NumVarArgRegs + 1) * 4 + : NumVarArgRegs * 4; + + if (RegSaveAreaSizePlusPadding > 0) { + // The offset to saved register area should be 8 byte aligned. 
+ int RegAreaStart = HEXAGON_LRFP_SIZE + CCInfo.getNextStackOffset(); + if (!(RegAreaStart % 8)) + RegAreaStart = (RegAreaStart + 7) & -8; - if (IsVarArg) { + int RegSaveAreaFrameIndex = + MFI.CreateFixedObject(RegSaveAreaSizePlusPadding, RegAreaStart, true); + HMFI.setRegSavedAreaStartFrameIndex(RegSaveAreaFrameIndex); + + // This will point to the next argument passed via stack. + int Offset = RegAreaStart + RegSaveAreaSizePlusPadding; + int FI = MFI.CreateFixedObject(Hexagon_PointerSize, Offset, true); + HMFI.setVarArgsFrameIndex(FI); + } else { + // This will point to the next argument passed via stack, when + // there is no saved register area. + int Offset = HEXAGON_LRFP_SIZE + CCInfo.getNextStackOffset(); + int FI = MFI.CreateFixedObject(Hexagon_PointerSize, Offset, true); + HMFI.setRegSavedAreaStartFrameIndex(FI); + HMFI.setVarArgsFrameIndex(FI); + } + } + + + if (IsVarArg && !Subtarget.isEnvironmentMusl()) { // This will point to the next argument passed via stack. int Offset = HEXAGON_LRFP_SIZE + CCInfo.getNextStackOffset(); int FI = MFI.CreateFixedObject(Hexagon_PointerSize, Offset, true); @@ -857,8 +921,82 @@ HexagonMachineFunctionInfo *QFI = MF.getInfo(); SDValue Addr = DAG.getFrameIndex(QFI->getVarArgsFrameIndex(), MVT::i32); const Value *SV = cast(Op.getOperand(2))->getValue(); - return DAG.getStore(Op.getOperand(0), SDLoc(Op), Addr, Op.getOperand(1), - MachinePointerInfo(SV)); + + if (!Subtarget.isEnvironmentMusl()) { + return DAG.getStore(Op.getOperand(0), SDLoc(Op), Addr, Op.getOperand(1), + MachinePointerInfo(SV)); + } + auto &FuncInfo = *MF.getInfo(); + auto &HFL = *Subtarget.getFrameLowering(); + SDLoc DL(Op); + SmallVector MemOps; + + // Get frame index of va_list. + SDValue FIN = Op.getOperand(1); + + // If first Vararg register is odd, add 4 bytes to start of + // saved register area to point to the first register location. + // This is because the saved register area has to be 8 byte aligned. 
+ // Incase of an odd start register, there will be 4 bytes of padding in + // the beginning of saved register area. If all registers area used up, + // the following condition will handle it correctly. + SDValue SavedRegAreaStartFrameIndex = + DAG.getFrameIndex(FuncInfo.getRegSavedAreaStartFrameIndex(), MVT::i32); + + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + if (HFL.FirstVarArgSavedReg & 1) + SavedRegAreaStartFrameIndex = + DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getFrameIndex(FuncInfo.getRegSavedAreaStartFrameIndex(), + MVT::i32), + DAG.getIntPtrConstant(4, DL)); + + // Store the saved register area start pointer. + SDValue Store = + DAG.getStore(Op.getOperand(0), DL, + SavedRegAreaStartFrameIndex, + FIN, MachinePointerInfo(SV)); + MemOps.push_back(Store); + + // Store saved register area end pointer. + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, + FIN, DAG.getIntPtrConstant(4, DL)); + Store = DAG.getStore(Op.getOperand(0), DL, + DAG.getFrameIndex(FuncInfo.getVarArgsFrameIndex(), + PtrVT), + FIN, MachinePointerInfo(SV, 4)); + MemOps.push_back(Store); + + // Store overflow area pointer. + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, + FIN, DAG.getIntPtrConstant(4, DL)); + Store = DAG.getStore(Op.getOperand(0), DL, + DAG.getFrameIndex(FuncInfo.getVarArgsFrameIndex(), + PtrVT), + FIN, MachinePointerInfo(SV, 8)); + MemOps.push_back(Store); + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); +} + +SDValue +HexagonTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { + // Assert that the linux ABI is enabled for the current compilation. + assert(Subtarget.isEnvironmentMusl() && "Linux ABI should be enabled"); + SDValue Chain = Op.getOperand(0); + SDValue DestPtr = Op.getOperand(1); + SDValue SrcPtr = Op.getOperand(2); + const Value *DestSV = cast(Op.getOperand(3))->getValue(); + const Value *SrcSV = cast(Op.getOperand(4))->getValue(); + SDLoc DL(Op); + // Size of the va_list is 12 bytes as it has 3 pointers. 
Therefore, + // we need to memcopy 12 bytes from va_list to another similar list. + return DAG.getMemcpy(Chain, DL, DestPtr, SrcPtr, + DAG.getIntPtrConstant(12, DL), 4, /*isVolatile*/false, + false, false, + MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); + } SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { @@ -1375,7 +1513,10 @@ setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAEND, MVT::Other, Expand); setOperationAction(ISD::VAARG, MVT::Other, Expand); - setOperationAction(ISD::VACOPY, MVT::Other, Expand); + if (Subtarget.isEnvironmentMusl()) + setOperationAction(ISD::VACOPY, MVT::Other, Custom); + else + setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); @@ -2928,6 +3069,7 @@ case ISD::GlobalAddress: return LowerGLOBALADDRESS(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); + case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); diff --git a/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h --- a/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h @@ -30,6 +30,9 @@ unsigned StackAlignBaseVReg = 0; // Aligned-stack base register (virtual) unsigned StackAlignBasePhysReg = 0; // (physical) int VarArgsFrameIndex; + int RegSavedAreaStartFrameIndex; + int FirstNamedArgFrameIndex; + int LastNamedArgFrameIndex; bool HasClobberLR = false; bool HasEHReturn = false; std::map PacketInfo; @@ -46,6 +49,15 @@ void setVarArgsFrameIndex(int v) { VarArgsFrameIndex = v; } int getVarArgsFrameIndex() { return VarArgsFrameIndex; } + void 
setRegSavedAreaStartFrameIndex(int v) { RegSavedAreaStartFrameIndex = v;} + int getRegSavedAreaStartFrameIndex() { return RegSavedAreaStartFrameIndex; } + + void setFirstNamedArgFrameIndex(int v) { FirstNamedArgFrameIndex = v; } + int getFirstNamedArgFrameIndex() { return FirstNamedArgFrameIndex; } + + void setLastNamedArgFrameIndex(int v) { LastNamedArgFrameIndex = v; } + int getLastNamedArgFrameIndex() { return LastNamedArgFrameIndex; } + void setStartPacket(MachineInstr* MI) { PacketInfo[MI] |= Hexagon::StartPacket; } diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -86,6 +86,7 @@ private: std::string CPUString; + Triple TargetTriple; HexagonInstrInfo InstrInfo; HexagonRegisterInfo RegInfo; HexagonTargetLowering TLInfo; @@ -97,6 +98,11 @@ HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM); + const Triple &getTargetTriple() const { return TargetTriple; } + bool isEnvironmentMusl() const { + return TargetTriple.getEnvironment() == Triple::Musl; + } + /// getInstrItins - Return the instruction itineraries based on subtarget /// selection. 
const InstrItineraryData *getInstrItineraryData() const override { diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -79,7 +79,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM) : HexagonGenSubtargetInfo(TT, CPU, FS), OptLevel(TM.getOptLevel()), - CPUString(Hexagon_MC::selectHexagonCPU(CPU)), + CPUString(Hexagon_MC::selectHexagonCPU(CPU)), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), RegInfo(getHwMode()), TLInfo(TM, *this), InstrItins(getInstrItineraryForCPU(CPUString)) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -2317,6 +2317,48 @@ return IntrinsicInst::Create(F, { TVal, TVal, ShAmt }); } +static Instruction *foldSelectToCopysign(SelectInst &Sel, + InstCombiner::BuilderTy &Builder) { + Value *Cond = Sel.getCondition(); + Value *TVal = Sel.getTrueValue(); + Value *FVal = Sel.getFalseValue(); + Type *SelType = Sel.getType(); + + // Match select ?, TC, FC where the constants are equal but negated. + // TODO: Generalize to handle a negated variable operand? + const APFloat *TC, *FC; + if (!match(TVal, m_APFloat(TC)) || !match(FVal, m_APFloat(FC)) || + !abs(*TC).bitwiseIsEqual(abs(*FC))) + return nullptr; + + assert(TC != FC && "Expected equal select arms to simplify"); + + Value *X; + const APInt *C; + bool IsTrueIfSignSet; + ICmpInst::Predicate Pred; + if (!match(Cond, m_OneUse(m_ICmp(Pred, m_BitCast(m_Value(X)), m_APInt(C)))) || + !isSignBitCheck(Pred, *C, IsTrueIfSignSet) || X->getType() != SelType) + return nullptr; + + // If needed, negate the value that will be the sign argument of the copysign: + // (bitcast X) < 0 ? 
-TC : TC --> copysign(TC, X) + // (bitcast X) < 0 ? TC : -TC --> copysign(TC, -X) + // (bitcast X) >= 0 ? -TC : TC --> copysign(TC, -X) + // (bitcast X) >= 0 ? TC : -TC --> copysign(TC, X) + if (IsTrueIfSignSet ^ TC->isNegative()) + X = Builder.CreateFNegFMF(X, &Sel); + + // Canonicalize the magnitude argument as the positive constant since we do + // not care about its sign. + Value *MagArg = TC->isNegative() ? FVal : TVal; + Function *F = Intrinsic::getDeclaration(Sel.getModule(), Intrinsic::copysign, + Sel.getType()); + Instruction *CopySign = IntrinsicInst::Create(F, { MagArg, X }); + CopySign->setFastMathFlags(Sel.getFastMathFlags()); + return CopySign; +} + Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { Value *CondVal = SI.getCondition(); Value *TrueVal = SI.getTrueValue(); @@ -2785,5 +2827,8 @@ if (Instruction *Rot = foldSelectRotate(SI)) return Rot; + if (Instruction *Copysign = foldSelectToCopysign(SI, Builder)) + return Copysign; + return nullptr; } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1089,8 +1089,11 @@ else { // We cannot allow unaligned ops for unordered load/store, so reject // anything where the alignment isn't at least the element size. - unsigned Align = std::min(SI->getAlignment(), LI->getAlignment()); - if (Align < StoreSize) + const MaybeAlign StoreAlign = SI->getAlign(); + const MaybeAlign LoadAlign = LI->getAlign(); + if (StoreAlign == None || LoadAlign == None) + return false; + if (*StoreAlign < StoreSize || *LoadAlign < StoreSize) return false; // If the element.atomic memcpy is not lowered into explicit @@ -1104,8 +1107,8 @@ // Note that unordered atomic loads/stores are *required* by the spec to // have an alignment but non-atomic loads/stores may not. 
NewCall = Builder.CreateElementUnorderedAtomicMemCpy( - StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(), - NumBytes, StoreSize); + StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, + StoreSize); } NewCall->setDebugLoc(SI->getDebugLoc()); diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -80,6 +80,7 @@ llvm-lto2 llvm-mc llvm-mca + llvm-ml llvm-modextract llvm-mt llvm-nm diff --git a/llvm/test/CodeGen/Hexagon/vacopy.ll b/llvm/test/CodeGen/Hexagon/vacopy.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vacopy.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv62 -mtriple=hexagon-unknown-linux-musl < %s | FileCheck %s +; CHECK-LABEL: PrintInts: +; CHECK-DAG: memw{{.*}} = r{{[0-9]+}} +; CHECK-DAG: memw{{.*}} = r{{[0-9]+}} +; CHECK-DAG: r{{[0-9]+}}:{{[0-9]+}} = memd{{.*}} +; CHECK-DAG: memd{{.*}} = r{{[0-9]+}}:{{[0-9]+}} + +%struct.__va_list_tag = type { i8*, i8*, i8* } + +; Function Attrs: nounwind +define void @PrintInts(i32 %first, ...) #0 { +entry: + %vl = alloca [1 x %struct.__va_list_tag], align 8 + %vl_count = alloca [1 x %struct.__va_list_tag], align 8 + %arraydecay1 = bitcast [1 x %struct.__va_list_tag]* %vl to i8* + call void @llvm.va_start(i8* %arraydecay1) + %0 = bitcast [1 x %struct.__va_list_tag]* %vl_count to i8* + call void @llvm.va_copy(i8* %0, i8* %arraydecay1) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.va_start(i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.va_copy(i8*, i8*) #1 + +; Function Attrs: nounwind +define i32 @main() #0 { +entry: + tail call void (i32, ...) 
@PrintInts(i32 undef, i32 20, i32 30, i32 40, i32 50, i32 0) + ret i32 0 +} + +attributes #0 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"Clang 3.1"} diff --git a/llvm/test/CodeGen/Hexagon/vararg-deallocate-sp.ll b/llvm/test/CodeGen/Hexagon/vararg-deallocate-sp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vararg-deallocate-sp.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=hexagon -mtriple=hexagon-unknown-linux-musl < %s | FileCheck %s + +; Test that the compiler deallocates the register saved area on Linux +; for functions that do not need a frame pointer. + +; CHECK: r29 = add(r29,#-[[SIZE:[0-9]+]] +; CHECK: r29 = add(r29,#[[SIZE]]) + +define void @test(...) { +entry: + ret void +} + diff --git a/llvm/test/CodeGen/Hexagon/vararg-linux-abi.ll b/llvm/test/CodeGen/Hexagon/vararg-linux-abi.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vararg-linux-abi.ll @@ -0,0 +1,93 @@ +; RUN: llc -march=hexagon -mtriple=hexagon-unknown-linux-musl < %s | FileCheck %s + +; Check that we update the stack pointer before we do allocframe, so that +; the LR/FP are stored in the location required by the Linux ABI. +; CHECK: r29 = add(r29,#-24) +; CHECK: allocframe + +target triple = "hexagon-unknown-linux" + +%s.0 = type { i8*, i8*, i8* } + +define dso_local i32 @f0(i32 %a0, ...) 
local_unnamed_addr #0 { +b0: + %v0 = alloca [1 x %s.0], align 8 + %v1 = bitcast [1 x %s.0]* %v0 to i8* + call void @llvm.lifetime.start.p0i8(i64 12, i8* nonnull %v1) #2 + call void @llvm.va_start(i8* nonnull %v1) + %v2 = getelementptr inbounds [1 x %s.0], [1 x %s.0]* %v0, i32 0, i32 0, i32 0 + %v3 = load i8*, i8** %v2, align 8 + %v4 = getelementptr inbounds [1 x %s.0], [1 x %s.0]* %v0, i32 0, i32 0, i32 1 + %v5 = load i8*, i8** %v4, align 4 + %v6 = getelementptr i8, i8* %v3, i32 4 + %v7 = icmp sgt i8* %v6, %v5 + br i1 %v7, label %b1, label %b2 + +b1: ; preds = %b0 + %v8 = getelementptr inbounds [1 x %s.0], [1 x %s.0]* %v0, i32 0, i32 0, i32 2 + %v9 = load i8*, i8** %v8, align 8 + %v10 = getelementptr i8, i8* %v9, i32 4 + store i8* %v10, i8** %v8, align 8 + br label %b2 + +b2: ; preds = %b1, %b0 + %v11 = phi i8* [ %v10, %b1 ], [ %v6, %b0 ] + %v12 = phi i8* [ %v9, %b1 ], [ %v3, %b0 ] + %v13 = bitcast i8* %v12 to i32* + store i8* %v11, i8** %v2, align 8 + %v14 = load i32, i32* %v13, align 4 + %v15 = icmp eq i32 %v14, 0 + br i1 %v15, label %b7, label %b3 + +b3: ; preds = %b2 + %v16 = getelementptr inbounds [1 x %s.0], [1 x %s.0]* %v0, i32 0, i32 0, i32 2 + br label %b4 + +b4: ; preds = %b6, %b3 + %v17 = phi i32 [ %v14, %b3 ], [ %v28, %b6 ] + %v18 = phi i32 [ %a0, %b3 ], [ %v20, %b6 ] + %v19 = phi i8* [ %v11, %b3 ], [ %v25, %b6 ] + %v20 = add nsw i32 %v17, %v18 + %v21 = getelementptr i8, i8* %v19, i32 4 + %v22 = icmp sgt i8* %v21, %v5 + br i1 %v22, label %b5, label %b6 + +b5: ; preds = %b4 + %v23 = load i8*, i8** %v16, align 8 + %v24 = getelementptr i8, i8* %v23, i32 4 + store i8* %v24, i8** %v16, align 8 + br label %b6 + +b6: ; preds = %b5, %b4 + %v25 = phi i8* [ %v24, %b5 ], [ %v21, %b4 ] + %v26 = phi i8* [ %v23, %b5 ], [ %v19, %b4 ] + %v27 = bitcast i8* %v26 to i32* + store i8* %v25, i8** %v2, align 8 + %v28 = load i32, i32* %v27, align 4 + %v29 = icmp eq i32 %v28, 0 + br i1 %v29, label %b7, label %b4 + +b7: ; preds = %b6, %b2 + %v30 = phi i32 [ %a0, %b2 ], [ %v20, 
%b6 ] + call void @llvm.va_end(i8* nonnull %v1) + call void @llvm.lifetime.end.p0i8(i64 12, i8* nonnull %v1) #2 + ret i32 %v30 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: nounwind +declare void @llvm.va_start(i8*) #2 + +; Function Attrs: nounwind +declare void @llvm.va_end(i8*) #2 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +attributes #0 = { argmemonly nounwind "frame-pointer"="all" } + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"wchar_size", i32 4} diff --git a/llvm/test/CodeGen/Hexagon/vararg.ll b/llvm/test/CodeGen/Hexagon/vararg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vararg.ll @@ -0,0 +1,97 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv62 -mtriple=hexagon-unknown-linux-musl -O0 < %s | FileCheck %s + +; CHECK-LABEL: foo: + +; Check function prologue generation +; CHECK: r29 = add(r29,#-24) +; CHECK: memw(r29+#4) = r1 +; CHECK: memw(r29+#8) = r2 +; CHECK: memw(r29+#12) = r3 +; CHECK: memw(r29+#16) = r4 +; CHECK: memw(r29+#20) = r5 +; CHECK: r29 = add(r29,#24) + + +%struct.AAA = type { i32, i32, i32, i32 } +%struct.__va_list_tag = type { i8*, i8*, i8* } + +@aaa = global %struct.AAA { i32 100, i32 200, i32 300, i32 400 }, align 4 +@.str = private unnamed_addr constant [13 x i8] c"result = %d\0A\00", align 1 + +; Function Attrs: nounwind +define i32 @foo(i32 %xx, ...) 
#0 { +entry: + %ap = alloca [1 x %struct.__va_list_tag], align 8 + %arraydecay1 = bitcast [1 x %struct.__va_list_tag]* %ap to i8* + call void @llvm.va_start(i8* %arraydecay1) + %__current_saved_reg_area_pointer_p = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0, i32 0 + %__current_saved_reg_area_pointer = load i8*, i8** %__current_saved_reg_area_pointer_p, align 8 + %__saved_reg_area_end_pointer_p = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0, i32 1 + %__saved_reg_area_end_pointer = load i8*, i8** %__saved_reg_area_end_pointer_p, align 4 + %__new_saved_reg_area_pointer = getelementptr i8, i8* %__current_saved_reg_area_pointer, i32 4 + %0 = icmp sgt i8* %__new_saved_reg_area_pointer, %__saved_reg_area_end_pointer + %__overflow_area_pointer_p = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0, i32 2 + %__overflow_area_pointer = load i8*, i8** %__overflow_area_pointer_p, align 8 + br i1 %0, label %vaarg.on_stack, label %vaarg.end + +vaarg.on_stack: ; preds = %entry + %__overflow_area_pointer.next = getelementptr i8, i8* %__overflow_area_pointer, i32 4 + store i8* %__overflow_area_pointer.next, i8** %__overflow_area_pointer_p, align 8 + br label %vaarg.end + +vaarg.end: ; preds = %entry, %vaarg.on_stack + %__overflow_area_pointer5 = phi i8* [ %__overflow_area_pointer.next, %vaarg.on_stack ], [ %__overflow_area_pointer, %entry ] + %storemerge32 = phi i8* [ %__overflow_area_pointer.next, %vaarg.on_stack ], [ %__new_saved_reg_area_pointer, %entry ] + %vaarg.addr.in = phi i8* [ %__overflow_area_pointer, %vaarg.on_stack ], [ %__current_saved_reg_area_pointer, %entry ] + store i8* %storemerge32, i8** %__current_saved_reg_area_pointer_p, align 8 + %vaarg.addr = bitcast i8* %vaarg.addr.in to i32* + %1 = load i32, i32* %vaarg.addr, align 4 + %__overflow_area_pointer_p4 = getelementptr inbounds [1 x %struct.__va_list_tag], 
[1 x %struct.__va_list_tag]* %ap, i32 0, i32 0, i32 2 + %__overflow_area_pointer.next6 = getelementptr i8, i8* %__overflow_area_pointer5, i32 16 + store i8* %__overflow_area_pointer.next6, i8** %__overflow_area_pointer_p4, align 8 + %bbb.sroa.1.0.idx27 = getelementptr inbounds i8, i8* %__overflow_area_pointer5, i32 12 + %2 = bitcast i8* %bbb.sroa.1.0.idx27 to i32* + %bbb.sroa.1.0.copyload = load i32, i32* %2, align 4 + %add8 = add nsw i32 %bbb.sroa.1.0.copyload, %1 + %__new_saved_reg_area_pointer15 = getelementptr i8, i8* %storemerge32, i32 4 + %3 = icmp sgt i8* %__new_saved_reg_area_pointer15, %__saved_reg_area_end_pointer + br i1 %3, label %vaarg.on_stack17, label %vaarg.end21 + +vaarg.on_stack17: ; preds = %vaarg.end + %__overflow_area_pointer.next20 = getelementptr i8, i8* %__overflow_area_pointer5, i32 20 + store i8* %__overflow_area_pointer.next20, i8** %__overflow_area_pointer_p4, align 8 + br label %vaarg.end21 + +vaarg.end21: ; preds = %vaarg.end, %vaarg.on_stack17 + %storemerge = phi i8* [ %__overflow_area_pointer.next20, %vaarg.on_stack17 ], [ %__new_saved_reg_area_pointer15, %vaarg.end ] + %vaarg.addr22.in = phi i8* [ %__overflow_area_pointer.next6, %vaarg.on_stack17 ], [ %storemerge32, %vaarg.end ] + store i8* %storemerge, i8** %__current_saved_reg_area_pointer_p, align 8 + %vaarg.addr22 = bitcast i8* %vaarg.addr22.in to i32* + %4 = load i32, i32* %vaarg.addr22, align 4 + %add23 = add nsw i32 %add8, %4 + call void @llvm.va_end(i8* %arraydecay1) + ret i32 %add23 +} + +; Function Attrs: nounwind +declare void @llvm.va_start(i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.va_end(i8*) #1 + +; Function Attrs: nounwind +define i32 @main() #0 { +entry: + %call = tail call i32 (i32, ...) @foo(i32 undef, i32 2, %struct.AAA* byval align 4 @aaa, i32 4) + %call1 = tail call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %call) #1 + ret i32 %call +} + +; Function Attrs: nounwind +declare i32 @printf(i8* nocapture readonly, ...) #0 + +attributes #0 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"Clang 3.1"} diff --git a/llvm/test/CodeGen/Hexagon/vararg_align_check.ll b/llvm/test/CodeGen/Hexagon/vararg_align_check.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vararg_align_check.ll @@ -0,0 +1,186 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv62 -mtriple=hexagon-unknown-linux-musl -O0 < %s | FileCheck %s + +; CHECK-LABEL: foo: + +; Check Function prologue. +; Note. All register numbers and offset are fixed. +; Hence, no need of regular expression. + +; CHECK: r29 = add(r29,#-24) +; CHECK: r7:6 = memd(r29+#24) +; CHECK: memd(r29+#0) = r7:6 +; CHECK: r7:6 = memd(r29+#32) +; CHECK: memd(r29+#8) = r7:6 +; CHECK: r7:6 = memd(r29+#40) +; CHECK: memd(r29+#16) = r7:6 +; CHECK: memw(r29+#28) = r1 +; CHECK: memw(r29+#32) = r2 +; CHECK: memw(r29+#36) = r3 +; CHECK: memw(r29+#40) = r4 +; CHECK: memw(r29+#44) = r5 +; CHECK: r29 = add(r29,#24) + +%struct.AAA = type { i32, i32, i32, i32 } +%struct.BBB = type { i8, i64, i32 } +%struct.__va_list_tag = type { i8*, i8*, i8* } + +@aaa = global %struct.AAA { i32 100, i32 200, i32 300, i32 400 }, align 4 +@ddd = global { i8, i64, i32, [4 x i8] } { i8 1, i64 1000000, i32 5, [4 x i8] undef }, align 8 +@.str = private unnamed_addr constant [13 x i8] c"result = %d\0A\00", align 1 + +; Function Attrs: nounwind +define i32 @foo(i32 %xx, %struct.BBB* byval align 8 %eee, ...) 
#0 { +entry: + %xx.addr = alloca i32, align 4 + %ap = alloca [1 x %struct.__va_list_tag], align 8 + %d = alloca i32, align 4 + %k = alloca i64, align 8 + %ret = alloca i32, align 4 + %bbb = alloca %struct.AAA, align 4 + store i32 %xx, i32* %xx.addr, align 4 + store i32 0, i32* %ret, align 4 + %x = getelementptr inbounds %struct.BBB, %struct.BBB* %eee, i32 0, i32 0 + %0 = load i8, i8* %x, align 1 + %tobool = trunc i8 %0 to i1 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + store i32 1, i32* %ret, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8* + call void @llvm.va_start(i8* %arraydecay1) + %arraydecay2 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + br label %vaarg.maybe_reg + +vaarg.maybe_reg: ; preds = %if.end + %__current_saved_reg_area_pointer_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 0 + %__current_saved_reg_area_pointer = load i8*, i8** %__current_saved_reg_area_pointer_p + %__saved_reg_area_end_pointer_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 1 + %__saved_reg_area_end_pointer = load i8*, i8** %__saved_reg_area_end_pointer_p + %1 = ptrtoint i8* %__current_saved_reg_area_pointer to i32 + %align_current_saved_reg_area_pointer = add i32 %1, 7 + %align_current_saved_reg_area_pointer3 = and i32 %align_current_saved_reg_area_pointer, -8 + %align_current_saved_reg_area_pointer4 = inttoptr i32 %align_current_saved_reg_area_pointer3 to i8* + %__new_saved_reg_area_pointer = getelementptr i8, i8* %align_current_saved_reg_area_pointer4, i32 8 + %2 = icmp sgt i8* %__new_saved_reg_area_pointer, %__saved_reg_area_end_pointer + br i1 %2, label %vaarg.on_stack, label %vaarg.in_reg + 
+vaarg.in_reg: ; preds = %vaarg.maybe_reg + %3 = bitcast i8* %align_current_saved_reg_area_pointer4 to i64* + store i8* %__new_saved_reg_area_pointer, i8** %__current_saved_reg_area_pointer_p + br label %vaarg.end + +vaarg.on_stack: ; preds = %vaarg.maybe_reg + %__overflow_area_pointer_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 2 + %__overflow_area_pointer = load i8*, i8** %__overflow_area_pointer_p + %4 = ptrtoint i8* %__overflow_area_pointer to i32 + %align_overflow_area_pointer = add i32 %4, 7 + %align_overflow_area_pointer5 = and i32 %align_overflow_area_pointer, -8 + %align_overflow_area_pointer6 = inttoptr i32 %align_overflow_area_pointer5 to i8* + %__overflow_area_pointer.next = getelementptr i8, i8* %align_overflow_area_pointer6, i32 8 + store i8* %__overflow_area_pointer.next, i8** %__overflow_area_pointer_p + store i8* %__overflow_area_pointer.next, i8** %__current_saved_reg_area_pointer_p + %5 = bitcast i8* %align_overflow_area_pointer6 to i64* + br label %vaarg.end + +vaarg.end: ; preds = %vaarg.on_stack, %vaarg.in_reg + %vaarg.addr = phi i64* [ %3, %vaarg.in_reg ], [ %5, %vaarg.on_stack ] + %6 = load i64, i64* %vaarg.addr + store i64 %6, i64* %k, align 8 + %7 = load i64, i64* %k, align 8 + %conv = trunc i64 %7 to i32 + %div = sdiv i32 %conv, 1000 + %8 = load i32, i32* %ret, align 4 + %add = add nsw i32 %8, %div + store i32 %add, i32* %ret, align 4 + %arraydecay7 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + %__overflow_area_pointer_p8 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay7, i32 0, i32 2 + %__overflow_area_pointer9 = load i8*, i8** %__overflow_area_pointer_p8 + %9 = bitcast i8* %__overflow_area_pointer9 to %struct.AAA* + %__overflow_area_pointer.next10 = getelementptr i8, i8* %__overflow_area_pointer9, i32 16 + store i8* %__overflow_area_pointer.next10, i8** %__overflow_area_pointer_p8 + %10 = 
bitcast %struct.AAA* %bbb to i8* + %11 = bitcast %struct.AAA* %9 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %10, i8* %11, i32 16, i32 4, i1 false) + %d11 = getelementptr inbounds %struct.AAA, %struct.AAA* %bbb, i32 0, i32 3 + %12 = load i32, i32* %d11, align 4 + %13 = load i32, i32* %ret, align 4 + %add12 = add nsw i32 %13, %12 + store i32 %add12, i32* %ret, align 4 + %arraydecay13 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + br label %vaarg.maybe_reg14 + +vaarg.maybe_reg14: ; preds = %vaarg.end + %__current_saved_reg_area_pointer_p15 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay13, i32 0, i32 0 + %__current_saved_reg_area_pointer16 = load i8*, i8** %__current_saved_reg_area_pointer_p15 + %__saved_reg_area_end_pointer_p17 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay13, i32 0, i32 1 + %__saved_reg_area_end_pointer18 = load i8*, i8** %__saved_reg_area_end_pointer_p17 + %__new_saved_reg_area_pointer19 = getelementptr i8, i8* %__current_saved_reg_area_pointer16, i32 4 + %14 = icmp sgt i8* %__new_saved_reg_area_pointer19, %__saved_reg_area_end_pointer18 + br i1 %14, label %vaarg.on_stack21, label %vaarg.in_reg20 + +vaarg.in_reg20: ; preds = %vaarg.maybe_reg14 + %15 = bitcast i8* %__current_saved_reg_area_pointer16 to i32* + store i8* %__new_saved_reg_area_pointer19, i8** %__current_saved_reg_area_pointer_p15 + br label %vaarg.end25 + +vaarg.on_stack21: ; preds = %vaarg.maybe_reg14 + %__overflow_area_pointer_p22 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay13, i32 0, i32 2 + %__overflow_area_pointer23 = load i8*, i8** %__overflow_area_pointer_p22 + %__overflow_area_pointer.next24 = getelementptr i8, i8* %__overflow_area_pointer23, i32 4 + store i8* %__overflow_area_pointer.next24, i8** %__overflow_area_pointer_p22 + store i8* %__overflow_area_pointer.next24, i8** 
%__current_saved_reg_area_pointer_p15 + %16 = bitcast i8* %__overflow_area_pointer23 to i32* + br label %vaarg.end25 + +vaarg.end25: ; preds = %vaarg.on_stack21, %vaarg.in_reg20 + %vaarg.addr26 = phi i32* [ %15, %vaarg.in_reg20 ], [ %16, %vaarg.on_stack21 ] + %17 = load i32, i32* %vaarg.addr26 + store i32 %17, i32* %d, align 4 + %18 = load i32, i32* %d, align 4 + %19 = load i32, i32* %ret, align 4 + %add27 = add nsw i32 %19, %18 + store i32 %add27, i32* %ret, align 4 + %arraydecay28 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + %arraydecay2829 = bitcast %struct.__va_list_tag* %arraydecay28 to i8* + call void @llvm.va_end(i8* %arraydecay2829) + %20 = load i32, i32* %ret, align 4 + ret i32 %20 +} + +; Function Attrs: nounwind +declare void @llvm.va_start(i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1 + +; Function Attrs: nounwind +declare void @llvm.va_end(i8*) #1 + +; Function Attrs: nounwind +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %x = alloca i32, align 4 + %m = alloca i64, align 8 + store i32 0, i32* %retval + store i64 1000000, i64* %m, align 8 + %0 = load i64, i64* %m, align 8 + %call = call i32 (i32, %struct.BBB*, ...) @foo(i32 1, %struct.BBB* byval align 8 bitcast ({ i8, i64, i32, [4 x i8] }* @ddd to %struct.BBB*), i64 %0, %struct.AAA* byval align 4 @aaa, i32 4) + store i32 %call, i32* %x, align 4 + %1 = load i32, i32* %x, align 4 + %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %1) + %2 = load i32, i32* %x, align 4 + ret i32 %2 +} + +declare i32 @printf(i8*, ...) 
#2 + +attributes #1 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"Clang 3.1"} diff --git a/llvm/test/CodeGen/Hexagon/vararg_double_onstack.ll b/llvm/test/CodeGen/Hexagon/vararg_double_onstack.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vararg_double_onstack.ll @@ -0,0 +1,214 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv62 -mtriple=hexagon-unknown-linux-musl -O0 < %s | FileCheck %s + +; CHECK-LABEL: foo: + +; Check Function prologue. +; Note. All register numbers and offset are fixed. +; Hence, no need of regular expression. + +; CHECK: r29 = add(r29,#-8) +; CHECK: memw(r29+#4) = r5 +; CHECK: r29 = add(r29,#8) + +%struct.AAA = type { i32, i32, i32, i32 } +%struct.__va_list_tag = type { i8*, i8*, i8* } + +@aaa = global %struct.AAA { i32 100, i32 200, i32 300, i32 400 }, align 4 +@.str = private unnamed_addr constant [13 x i8] c"result = %d\0A\00", align 1 + +; Function Attrs: nounwind +define i32 @foo(i32 %xx, i32 %a, i32 %b, i32 %c, i32 %x, ...) #0 { +entry: + %xx.addr = alloca i32, align 4 + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + %x.addr = alloca i32, align 4 + %ap = alloca [1 x %struct.__va_list_tag], align 8 + %d = alloca i32, align 4 + %ret = alloca i32, align 4 + %bbb = alloca %struct.AAA, align 4 + store i32 %xx, i32* %xx.addr, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 %b, i32* %b.addr, align 4 + store i32 %c, i32* %c.addr, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 0, i32* %ret, align 4 + %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8* + call void @llvm.va_start(i8* %arraydecay1) + %arraydecay2 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + br label %vaarg.maybe_reg + +vaarg.maybe_reg: ; preds = %entry + %__current_saved_reg_area_pointer_p = getelementptr 
inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 0 + %__current_saved_reg_area_pointer = load i8*, i8** %__current_saved_reg_area_pointer_p + %__saved_reg_area_end_pointer_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 1 + %__saved_reg_area_end_pointer = load i8*, i8** %__saved_reg_area_end_pointer_p + %0 = ptrtoint i8* %__current_saved_reg_area_pointer to i32 + %align_current_saved_reg_area_pointer = add i32 %0, 7 + %align_current_saved_reg_area_pointer3 = and i32 %align_current_saved_reg_area_pointer, -8 + %align_current_saved_reg_area_pointer4 = inttoptr i32 %align_current_saved_reg_area_pointer3 to i8* + %__new_saved_reg_area_pointer = getelementptr i8, i8* %align_current_saved_reg_area_pointer4, i32 8 + %1 = icmp sgt i8* %__new_saved_reg_area_pointer, %__saved_reg_area_end_pointer + br i1 %1, label %vaarg.on_stack, label %vaarg.in_reg + +vaarg.in_reg: ; preds = %vaarg.maybe_reg + %2 = bitcast i8* %align_current_saved_reg_area_pointer4 to i64* + store i8* %__new_saved_reg_area_pointer, i8** %__current_saved_reg_area_pointer_p + br label %vaarg.end + +vaarg.on_stack: ; preds = %vaarg.maybe_reg + %__overflow_area_pointer_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 2 + %__overflow_area_pointer = load i8*, i8** %__overflow_area_pointer_p + %3 = ptrtoint i8* %__overflow_area_pointer to i32 + %align_overflow_area_pointer = add i32 %3, 7 + %align_overflow_area_pointer5 = and i32 %align_overflow_area_pointer, -8 + %align_overflow_area_pointer6 = inttoptr i32 %align_overflow_area_pointer5 to i8* + %__overflow_area_pointer.next = getelementptr i8, i8* %align_overflow_area_pointer6, i32 8 + store i8* %__overflow_area_pointer.next, i8** %__overflow_area_pointer_p + store i8* %__overflow_area_pointer.next, i8** %__current_saved_reg_area_pointer_p + %4 = bitcast i8* %align_overflow_area_pointer6 to i64* + br label %vaarg.end + +vaarg.end: ; 
preds = %vaarg.on_stack, %vaarg.in_reg + %vaarg.addr = phi i64* [ %2, %vaarg.in_reg ], [ %4, %vaarg.on_stack ] + %5 = load i64, i64* %vaarg.addr + %conv = trunc i64 %5 to i32 + store i32 %conv, i32* %d, align 4 + %6 = load i32, i32* %d, align 4 + %7 = load i32, i32* %ret, align 4 + %add = add nsw i32 %7, %6 + store i32 %add, i32* %ret, align 4 + %arraydecay7 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + %__overflow_area_pointer_p8 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay7, i32 0, i32 2 + %__overflow_area_pointer9 = load i8*, i8** %__overflow_area_pointer_p8 + %8 = bitcast i8* %__overflow_area_pointer9 to %struct.AAA* + %__overflow_area_pointer.next10 = getelementptr i8, i8* %__overflow_area_pointer9, i32 16 + store i8* %__overflow_area_pointer.next10, i8** %__overflow_area_pointer_p8 + %9 = bitcast %struct.AAA* %bbb to i8* + %10 = bitcast %struct.AAA* %8 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %9, i8* %10, i32 16, i32 4, i1 false) + %d11 = getelementptr inbounds %struct.AAA, %struct.AAA* %bbb, i32 0, i32 3 + %11 = load i32, i32* %d11, align 4 + %12 = load i32, i32* %ret, align 4 + %add12 = add nsw i32 %12, %11 + store i32 %add12, i32* %ret, align 4 + %arraydecay13 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + br label %vaarg.maybe_reg14 + +vaarg.maybe_reg14: ; preds = %vaarg.end + %__current_saved_reg_area_pointer_p15 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay13, i32 0, i32 0 + %__current_saved_reg_area_pointer16 = load i8*, i8** %__current_saved_reg_area_pointer_p15 + %__saved_reg_area_end_pointer_p17 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay13, i32 0, i32 1 + %__saved_reg_area_end_pointer18 = load i8*, i8** %__saved_reg_area_end_pointer_p17 + %__new_saved_reg_area_pointer19 = getelementptr i8, i8* 
%__current_saved_reg_area_pointer16, i32 4 + %13 = icmp sgt i8* %__new_saved_reg_area_pointer19, %__saved_reg_area_end_pointer18 + br i1 %13, label %vaarg.on_stack21, label %vaarg.in_reg20 + +vaarg.in_reg20: ; preds = %vaarg.maybe_reg14 + %14 = bitcast i8* %__current_saved_reg_area_pointer16 to i32* + store i8* %__new_saved_reg_area_pointer19, i8** %__current_saved_reg_area_pointer_p15 + br label %vaarg.end25 + +vaarg.on_stack21: ; preds = %vaarg.maybe_reg14 + %__overflow_area_pointer_p22 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay13, i32 0, i32 2 + %__overflow_area_pointer23 = load i8*, i8** %__overflow_area_pointer_p22 + %__overflow_area_pointer.next24 = getelementptr i8, i8* %__overflow_area_pointer23, i32 4 + store i8* %__overflow_area_pointer.next24, i8** %__overflow_area_pointer_p22 + store i8* %__overflow_area_pointer.next24, i8** %__current_saved_reg_area_pointer_p15 + %15 = bitcast i8* %__overflow_area_pointer23 to i32* + br label %vaarg.end25 + +vaarg.end25: ; preds = %vaarg.on_stack21, %vaarg.in_reg20 + %vaarg.addr26 = phi i32* [ %14, %vaarg.in_reg20 ], [ %15, %vaarg.on_stack21 ] + %16 = load i32, i32* %vaarg.addr26 + store i32 %16, i32* %d, align 4 + %17 = load i32, i32* %d, align 4 + %18 = load i32, i32* %ret, align 4 + %add27 = add nsw i32 %18, %17 + store i32 %add27, i32* %ret, align 4 + %arraydecay28 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + br label %vaarg.maybe_reg29 + +vaarg.maybe_reg29: ; preds = %vaarg.end25 + %__current_saved_reg_area_pointer_p30 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay28, i32 0, i32 0 + %__current_saved_reg_area_pointer31 = load i8*, i8** %__current_saved_reg_area_pointer_p30 + %__saved_reg_area_end_pointer_p32 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay28, i32 0, i32 1 + %__saved_reg_area_end_pointer33 = load i8*, i8** 
%__saved_reg_area_end_pointer_p32 + %19 = ptrtoint i8* %__current_saved_reg_area_pointer31 to i32 + %align_current_saved_reg_area_pointer34 = add i32 %19, 7 + %align_current_saved_reg_area_pointer35 = and i32 %align_current_saved_reg_area_pointer34, -8 + %align_current_saved_reg_area_pointer36 = inttoptr i32 %align_current_saved_reg_area_pointer35 to i8* + %__new_saved_reg_area_pointer37 = getelementptr i8, i8* %align_current_saved_reg_area_pointer36, i32 8 + %20 = icmp sgt i8* %__new_saved_reg_area_pointer37, %__saved_reg_area_end_pointer33 + br i1 %20, label %vaarg.on_stack39, label %vaarg.in_reg38 + +vaarg.in_reg38: ; preds = %vaarg.maybe_reg29 + %21 = bitcast i8* %align_current_saved_reg_area_pointer36 to i64* + store i8* %__new_saved_reg_area_pointer37, i8** %__current_saved_reg_area_pointer_p30 + br label %vaarg.end46 + +vaarg.on_stack39: ; preds = %vaarg.maybe_reg29 + %__overflow_area_pointer_p40 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay28, i32 0, i32 2 + %__overflow_area_pointer41 = load i8*, i8** %__overflow_area_pointer_p40 + %22 = ptrtoint i8* %__overflow_area_pointer41 to i32 + %align_overflow_area_pointer42 = add i32 %22, 7 + %align_overflow_area_pointer43 = and i32 %align_overflow_area_pointer42, -8 + %align_overflow_area_pointer44 = inttoptr i32 %align_overflow_area_pointer43 to i8* + %__overflow_area_pointer.next45 = getelementptr i8, i8* %align_overflow_area_pointer44, i32 8 + store i8* %__overflow_area_pointer.next45, i8** %__overflow_area_pointer_p40 + store i8* %__overflow_area_pointer.next45, i8** %__current_saved_reg_area_pointer_p30 + %23 = bitcast i8* %align_overflow_area_pointer44 to i64* + br label %vaarg.end46 + +vaarg.end46: ; preds = %vaarg.on_stack39, %vaarg.in_reg38 + %vaarg.addr47 = phi i64* [ %21, %vaarg.in_reg38 ], [ %23, %vaarg.on_stack39 ] + %24 = load i64, i64* %vaarg.addr47 + %conv48 = trunc i64 %24 to i32 + store i32 %conv48, i32* %d, align 4 + %25 = load i32, i32* %d, align 4 + %26 = 
load i32, i32* %ret, align 4 + %add49 = add nsw i32 %26, %25 + store i32 %add49, i32* %ret, align 4 + %arraydecay50 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + %arraydecay5051 = bitcast %struct.__va_list_tag* %arraydecay50 to i8* + call void @llvm.va_end(i8* %arraydecay5051) + %27 = load i32, i32* %ret, align 4 + ret i32 %27 +} + +; Function Attrs: nounwind +declare void @llvm.va_start(i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1 + +; Function Attrs: nounwind +declare void @llvm.va_end(i8*) #1 + +; Function Attrs: nounwind +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %x = alloca i32, align 4 + %y = alloca i64, align 8 + store i32 0, i32* %retval + store i64 1000000, i64* %y, align 8 + %0 = load i64, i64* %y, align 8 + %1 = load i64, i64* %y, align 8 + %call = call i32 (i32, i32, i32, i32, i32, ...) @foo(i32 1, i32 2, i32 3, i32 4, i32 5, i64 %0, %struct.AAA* byval align 4 @aaa, i32 4, i64 %1) + store i32 %call, i32* %x, align 4 + %2 = load i32, i32* %x, align 4 + %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %2) + %3 = load i32, i32* %x, align 4 + ret i32 %3 +} + +declare i32 @printf(i8*, ...) #2 + +attributes #0 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"Clang 3.1"} diff --git a/llvm/test/CodeGen/Hexagon/vararg_named.ll b/llvm/test/CodeGen/Hexagon/vararg_named.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vararg_named.ll @@ -0,0 +1,211 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv62 -mtriple=hexagon-unknown-linux-musl -O0 < %s | FileCheck %s + +; CHECK-LABEL: foo: + +; Check Function prologue. +; Note. All register numbers and offset are fixed. +; Hence, no need of regular expression. 
+ +; CHECK: r29 = add(r29,#-16) +; CHECK: r7:6 = memd(r29+#16) +; CHECK: memd(r29+#0) = r7:6 +; CHECK: r7:6 = memd(r29+#24) +; CHECK: memd(r29+#8) = r7:6 +; CHECK: r7:6 = memd(r29+#32) +; CHECK: memd(r29+#16) = r7:6 +; CHECK: r7:6 = memd(r29+#40) +; CHECK: memd(r29+#24) = r7:6 +; CHECK: memw(r29+#36) = r3 +; CHECK: memw(r29+#40) = r4 +; CHECK: memw(r29+#44) = r5 +; CHECK: r29 = add(r29,#16) + +%struct.AAA = type { i32, i32, i32, i32 } +%struct.__va_list_tag = type { i8*, i8*, i8* } + +@aaa = global %struct.AAA { i32 100, i32 200, i32 300, i32 400 }, align 4 +@xxx = global %struct.AAA { i32 100, i32 200, i32 300, i32 400 }, align 4 +@yyy = global %struct.AAA { i32 100, i32 200, i32 300, i32 400 }, align 4 +@ccc = global %struct.AAA { i32 10, i32 20, i32 30, i32 40 }, align 4 +@fff = global %struct.AAA { i32 1, i32 2, i32 3, i32 4 }, align 4 +@.str = private unnamed_addr constant [13 x i8] c"result = %d\0A\00", align 1 + +; Function Attrs: nounwind +define i32 @foo(i32 %xx, i32 %z, i32 %m, %struct.AAA* byval align 4 %bbb, %struct.AAA* byval align 4 %GGG, ...) 
#0 { +entry: + %xx.addr = alloca i32, align 4 + %z.addr = alloca i32, align 4 + %m.addr = alloca i32, align 4 + %ap = alloca [1 x %struct.__va_list_tag], align 8 + %d = alloca i32, align 4 + %ret = alloca i32, align 4 + %ddd = alloca %struct.AAA, align 4 + %ggg = alloca %struct.AAA, align 4 + %nnn = alloca %struct.AAA, align 4 + store i32 %xx, i32* %xx.addr, align 4 + store i32 %z, i32* %z.addr, align 4 + store i32 %m, i32* %m.addr, align 4 + store i32 0, i32* %ret, align 4 + %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8* + call void @llvm.va_start(i8* %arraydecay1) + %d2 = getelementptr inbounds %struct.AAA, %struct.AAA* %bbb, i32 0, i32 3 + %0 = load i32, i32* %d2, align 4 + %1 = load i32, i32* %ret, align 4 + %add = add nsw i32 %1, %0 + store i32 %add, i32* %ret, align 4 + %2 = load i32, i32* %z.addr, align 4 + %3 = load i32, i32* %ret, align 4 + %add3 = add nsw i32 %3, %2 + store i32 %add3, i32* %ret, align 4 + %arraydecay4 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + br label %vaarg.maybe_reg + +vaarg.maybe_reg: ; preds = %entry + %__current_saved_reg_area_pointer_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay4, i32 0, i32 0 + %__current_saved_reg_area_pointer = load i8*, i8** %__current_saved_reg_area_pointer_p + %__saved_reg_area_end_pointer_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay4, i32 0, i32 1 + %__saved_reg_area_end_pointer = load i8*, i8** %__saved_reg_area_end_pointer_p + %__new_saved_reg_area_pointer = getelementptr i8, i8* %__current_saved_reg_area_pointer, i32 4 + %4 = icmp sgt i8* %__new_saved_reg_area_pointer, %__saved_reg_area_end_pointer + br i1 %4, label %vaarg.on_stack, label %vaarg.in_reg + +vaarg.in_reg: ; preds = %vaarg.maybe_reg + %5 = bitcast i8* 
%__current_saved_reg_area_pointer to i32* + store i8* %__new_saved_reg_area_pointer, i8** %__current_saved_reg_area_pointer_p + br label %vaarg.end + +vaarg.on_stack: ; preds = %vaarg.maybe_reg + %__overflow_area_pointer_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay4, i32 0, i32 2 + %__overflow_area_pointer = load i8*, i8** %__overflow_area_pointer_p + %__overflow_area_pointer.next = getelementptr i8, i8* %__overflow_area_pointer, i32 4 + store i8* %__overflow_area_pointer.next, i8** %__overflow_area_pointer_p + store i8* %__overflow_area_pointer.next, i8** %__current_saved_reg_area_pointer_p + %6 = bitcast i8* %__overflow_area_pointer to i32* + br label %vaarg.end + +vaarg.end: ; preds = %vaarg.on_stack, %vaarg.in_reg + %vaarg.addr = phi i32* [ %5, %vaarg.in_reg ], [ %6, %vaarg.on_stack ] + %7 = load i32, i32* %vaarg.addr + store i32 %7, i32* %d, align 4 + %8 = load i32, i32* %d, align 4 + %9 = load i32, i32* %ret, align 4 + %add5 = add nsw i32 %9, %8 + store i32 %add5, i32* %ret, align 4 + %arraydecay6 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + %__overflow_area_pointer_p7 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay6, i32 0, i32 2 + %__overflow_area_pointer8 = load i8*, i8** %__overflow_area_pointer_p7 + %10 = bitcast i8* %__overflow_area_pointer8 to %struct.AAA* + %__overflow_area_pointer.next9 = getelementptr i8, i8* %__overflow_area_pointer8, i32 16 + store i8* %__overflow_area_pointer.next9, i8** %__overflow_area_pointer_p7 + %11 = bitcast %struct.AAA* %ddd to i8* + %12 = bitcast %struct.AAA* %10 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %11, i8* %12, i32 16, i32 4, i1 false) + %d10 = getelementptr inbounds %struct.AAA, %struct.AAA* %ddd, i32 0, i32 3 + %13 = load i32, i32* %d10, align 4 + %14 = load i32, i32* %ret, align 4 + %add11 = add nsw i32 %14, %13 + store i32 %add11, i32* %ret, align 4 + %arraydecay12 = 
getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + %__overflow_area_pointer_p13 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay12, i32 0, i32 2 + %__overflow_area_pointer14 = load i8*, i8** %__overflow_area_pointer_p13 + %15 = bitcast i8* %__overflow_area_pointer14 to %struct.AAA* + %__overflow_area_pointer.next15 = getelementptr i8, i8* %__overflow_area_pointer14, i32 16 + store i8* %__overflow_area_pointer.next15, i8** %__overflow_area_pointer_p13 + %16 = bitcast %struct.AAA* %ggg to i8* + %17 = bitcast %struct.AAA* %15 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %16, i8* %17, i32 16, i32 4, i1 false) + %d16 = getelementptr inbounds %struct.AAA, %struct.AAA* %ggg, i32 0, i32 3 + %18 = load i32, i32* %d16, align 4 + %19 = load i32, i32* %ret, align 4 + %add17 = add nsw i32 %19, %18 + store i32 %add17, i32* %ret, align 4 + %arraydecay18 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + %__overflow_area_pointer_p19 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay18, i32 0, i32 2 + %__overflow_area_pointer20 = load i8*, i8** %__overflow_area_pointer_p19 + %20 = bitcast i8* %__overflow_area_pointer20 to %struct.AAA* + %__overflow_area_pointer.next21 = getelementptr i8, i8* %__overflow_area_pointer20, i32 16 + store i8* %__overflow_area_pointer.next21, i8** %__overflow_area_pointer_p19 + %21 = bitcast %struct.AAA* %nnn to i8* + %22 = bitcast %struct.AAA* %20 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %21, i8* %22, i32 16, i32 4, i1 false) + %d22 = getelementptr inbounds %struct.AAA, %struct.AAA* %nnn, i32 0, i32 3 + %23 = load i32, i32* %d22, align 4 + %24 = load i32, i32* %ret, align 4 + %add23 = add nsw i32 %24, %23 + store i32 %add23, i32* %ret, align 4 + %arraydecay24 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + br label 
%vaarg.maybe_reg25 + +vaarg.maybe_reg25: ; preds = %vaarg.end + %__current_saved_reg_area_pointer_p26 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay24, i32 0, i32 0 + %__current_saved_reg_area_pointer27 = load i8*, i8** %__current_saved_reg_area_pointer_p26 + %__saved_reg_area_end_pointer_p28 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay24, i32 0, i32 1 + %__saved_reg_area_end_pointer29 = load i8*, i8** %__saved_reg_area_end_pointer_p28 + %__new_saved_reg_area_pointer30 = getelementptr i8, i8* %__current_saved_reg_area_pointer27, i32 4 + %25 = icmp sgt i8* %__new_saved_reg_area_pointer30, %__saved_reg_area_end_pointer29 + br i1 %25, label %vaarg.on_stack32, label %vaarg.in_reg31 + +vaarg.in_reg31: ; preds = %vaarg.maybe_reg25 + %26 = bitcast i8* %__current_saved_reg_area_pointer27 to i32* + store i8* %__new_saved_reg_area_pointer30, i8** %__current_saved_reg_area_pointer_p26 + br label %vaarg.end36 + +vaarg.on_stack32: ; preds = %vaarg.maybe_reg25 + %__overflow_area_pointer_p33 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay24, i32 0, i32 2 + %__overflow_area_pointer34 = load i8*, i8** %__overflow_area_pointer_p33 + %__overflow_area_pointer.next35 = getelementptr i8, i8* %__overflow_area_pointer34, i32 4 + store i8* %__overflow_area_pointer.next35, i8** %__overflow_area_pointer_p33 + store i8* %__overflow_area_pointer.next35, i8** %__current_saved_reg_area_pointer_p26 + %27 = bitcast i8* %__overflow_area_pointer34 to i32* + br label %vaarg.end36 + +vaarg.end36: ; preds = %vaarg.on_stack32, %vaarg.in_reg31 + %vaarg.addr37 = phi i32* [ %26, %vaarg.in_reg31 ], [ %27, %vaarg.on_stack32 ] + %28 = load i32, i32* %vaarg.addr37 + store i32 %28, i32* %d, align 4 + %29 = load i32, i32* %d, align 4 + %30 = load i32, i32* %ret, align 4 + %add38 = add nsw i32 %30, %29 + store i32 %add38, i32* %ret, align 4 + %31 = load i32, i32* %m.addr, align 4 + %32 = load i32, i32* 
%ret, align 4 + %add39 = add nsw i32 %32, %31 + store i32 %add39, i32* %ret, align 4 + %arraydecay40 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0 + %arraydecay4041 = bitcast %struct.__va_list_tag* %arraydecay40 to i8* + call void @llvm.va_end(i8* %arraydecay4041) + %33 = load i32, i32* %ret, align 4 + ret i32 %33 +} + +; Function Attrs: nounwind +declare void @llvm.va_start(i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1 + +; Function Attrs: nounwind +declare void @llvm.va_end(i8*) #1 + +; Function Attrs: nounwind +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %x = alloca i32, align 4 + store i32 0, i32* %retval + %call = call i32 (i32, i32, i32, %struct.AAA*, %struct.AAA*, ...) @foo(i32 1, i32 3, i32 5, %struct.AAA* byval align 4 @aaa, %struct.AAA* byval align 4 @fff, i32 2, %struct.AAA* byval align 4 @xxx, %struct.AAA* byval align 4 @yyy, %struct.AAA* byval align 4 @ccc, i32 4) + store i32 %call, i32* %x, align 4 + %0 = load i32, i32* %x, align 4 + %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %0) + %1 = load i32, i32* %x, align 4 + ret i32 %1 +} + +declare i32 @printf(i8*, ...) 
#2 + +attributes #0 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"Clang 3.1"} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmaq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmaq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmaq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmaq.ll @@ -7,9 +7,10 @@ ; CHECK-NEXT: vmaxnma.f16 q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> %b) - %1 = tail call <8 x half> @llvm.maxnum.v8f16(<8 x half> %a, <8 x half> %0) - ret <8 x half> %1 + %0 = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> %a) + %1 = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> %b) + %2 = tail call <8 x half> @llvm.maxnum.v8f16(<8 x half> %0, <8 x half> %1) + ret <8 x half> %2 } declare <8 x half> @llvm.fabs.v8f16(<8 x half>) #1 @@ -22,9 +23,10 @@ ; CHECK-NEXT: vmaxnma.f32 q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %b) - %1 = tail call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %0) - ret <4 x float> %1 + %0 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) + %1 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %b) + %2 = tail call <4 x float> @llvm.maxnum.v4f32(<4 x float> %0, <4 x float> %1) + ret <4 x float> %2 } declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmaq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmaq.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmaq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmaq.ll @@ -7,9 +7,10 @@ ; CHECK-NEXT: vminnma.f16 q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> %b) - %1 = tail call <8 x half> @llvm.minnum.v8f16(<8 x half> %a, <8 x half> %0) - ret <8 x half> %1 + %0 = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> %a) + %1 = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> %b) + %2 = tail call <8 x half> @llvm.minnum.v8f16(<8 x half> %0, <8 x 
half> %1) + ret <8 x half> %2 } declare <8 x half> @llvm.fabs.v8f16(<8 x half>) #1 @@ -22,9 +23,10 @@ ; CHECK-NEXT: vminnma.f32 q0, q1 ; CHECK-NEXT: bx lr entry: - %0 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %b) - %1 = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %0) - ret <4 x float> %1 + %0 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) + %1 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %b) + %2 = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> %0, <4 x float> %1) + ret <4 x float> %2 } declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #1 diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -1535,9 +1535,7 @@ define float @copysign1(float %x) { ; CHECK-LABEL: @copysign1( -; CHECK-NEXT: [[I:%.*]] = bitcast float [[X:%.*]] to i32 -; CHECK-NEXT: [[ISPOS:%.*]] = icmp sgt i32 [[I]], -1 -; CHECK-NEXT: [[R:%.*]] = select i1 [[ISPOS]], float 1.000000e+00, float -1.000000e+00 +; CHECK-NEXT: [[R:%.*]] = call float @llvm.copysign.f32(float 1.000000e+00, float [[X:%.*]]) ; CHECK-NEXT: ret float [[R]] ; %i = bitcast float %x to i32 @@ -1548,9 +1546,8 @@ define <2 x float> @copysign2(<2 x float> %x) { ; CHECK-LABEL: @copysign2( -; CHECK-NEXT: [[I:%.*]] = bitcast <2 x float> [[X:%.*]] to <2 x i32> -; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt <2 x i32> [[I]], zeroinitializer -; CHECK-NEXT: [[R:%.*]] = select nsz <2 x i1> [[ISNEG]], <2 x float> , <2 x float> +; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x float> [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = call nsz <2 x float> @llvm.copysign.v2f32(<2 x float> , <2 x float> [[TMP1]]) ; CHECK-NEXT: ret <2 x float> [[R]] ; %i = bitcast <2 x float> %x to <2 x i32> @@ -1561,9 +1558,8 @@ define float @copysign3(float %x) { ; CHECK-LABEL: @copysign3( -; CHECK-NEXT: [[I:%.*]] = bitcast float [[X:%.*]] to i32 -; CHECK-NEXT: [[ISPOS:%.*]] = icmp sgt i32 
[[I]], -1 -; CHECK-NEXT: [[R:%.*]] = select fast i1 [[ISPOS]], float -4.300000e+01, float 4.300000e+01 +; CHECK-NEXT: [[TMP1:%.*]] = fneg fast float [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = call fast float @llvm.copysign.f32(float 4.300000e+01, float [[TMP1]]) ; CHECK-NEXT: ret float [[R]] ; %i = bitcast float %x to i32 @@ -1572,6 +1568,8 @@ ret float %r } +; TODO: Allow undefs when matching vectors. + define <2 x float> @copysign4(<2 x float> %x) { ; CHECK-LABEL: @copysign4( ; CHECK-NEXT: [[I:%.*]] = bitcast <2 x float> [[X:%.*]] to <2 x i32> @@ -1587,6 +1585,8 @@ declare void @use1(i1) +; Negative test + define float @copysign_extra_use(float %x) { ; CHECK-LABEL: @copysign_extra_use( ; CHECK-NEXT: [[I:%.*]] = bitcast float [[X:%.*]] to i32 @@ -1602,6 +1602,8 @@ ret float %r } +; Negative test + define float @copysign_type_mismatch(double %x) { ; CHECK-LABEL: @copysign_type_mismatch( ; CHECK-NEXT: [[I:%.*]] = bitcast double [[X:%.*]] to i64 @@ -1615,6 +1617,8 @@ ret float %r } +; Negative test + define float @copysign_wrong_cmp(float %x) { ; CHECK-LABEL: @copysign_wrong_cmp( ; CHECK-NEXT: [[I:%.*]] = bitcast float [[X:%.*]] to i32 @@ -1628,6 +1632,8 @@ ret float %r } +; Negative test + define float @copysign_wrong_const(float %x) { ; CHECK-LABEL: @copysign_wrong_const( ; CHECK-NEXT: [[I:%.*]] = bitcast float [[X:%.*]] to i32 diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll --- a/llvm/test/Transforms/InstSimplify/select.ll +++ b/llvm/test/Transforms/InstSimplify/select.ll @@ -59,8 +59,7 @@ define <2 x i32> @equal_arms_vec_undef(<2 x i1> %cond) { ; CHECK-LABEL: @equal_arms_vec_undef( -; CHECK-NEXT: [[V:%.*]] = select <2 x i1> [[COND:%.*]], <2 x i32> , <2 x i32> -; CHECK-NEXT: ret <2 x i32> [[V]] +; CHECK-NEXT: ret <2 x i32> ; %V = select <2 x i1> %cond, <2 x i32> , <2 x i32> ret <2 x i32> %V @@ -68,8 +67,7 @@ define <3 x float> @equal_arms_vec_less_undef(<3 x i1> %cond) { ; CHECK-LABEL: @equal_arms_vec_less_undef( -; 
CHECK-NEXT: [[V:%.*]] = select <3 x i1> [[COND:%.*]], <3 x float> , <3 x float> -; CHECK-NEXT: ret <3 x float> [[V]] +; CHECK-NEXT: ret <3 x float> ; %V = select <3 x i1> %cond, <3 x float> , <3 x float> ret <3 x float> %V @@ -77,8 +75,7 @@ define <3 x float> @equal_arms_vec_more_undef(<3 x i1> %cond) { ; CHECK-LABEL: @equal_arms_vec_more_undef( -; CHECK-NEXT: [[V:%.*]] = select <3 x i1> [[COND:%.*]], <3 x float> , <3 x float> -; CHECK-NEXT: ret <3 x float> [[V]] +; CHECK-NEXT: ret <3 x float> ; %V = select <3 x i1> %cond, <3 x float> , <3 x float> ret <3 x float> %V diff --git a/llvm/test/tools/llvm-ml/basic.test b/llvm/test/tools/llvm-ml/basic.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-ml/basic.test @@ -0,0 +1,4 @@ +# REQUIRES: x86-registered-target +# RUN: not llvm-ml %t.blah -o /dev/null 2>&1 | FileCheck --check-prefix=ENOENT %s + +# ENOENT: {{.*}}.blah: {{[Nn]}}o such file or directory diff --git a/llvm/test/tools/llvm-ml/run.test b/llvm/test/tools/llvm-ml/run.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-ml/run.test @@ -0,0 +1,3 @@ +# RUN: llvm-ml --help | FileCheck %s + +# CHECK: USAGE: llvm-ml diff --git a/llvm/test/tools/llvm-profdata/text-format-errors.test b/llvm/test/tools/llvm-profdata/text-format-errors.test --- a/llvm/test/tools/llvm-profdata/text-format-errors.test +++ b/llvm/test/tools/llvm-profdata/text-format-errors.test @@ -25,7 +25,7 @@ 4- Detect binary input RUN: not llvm-profdata show %p/Inputs/text-format-errors.text.bin 2>&1 | FileCheck %s --check-prefix=BINARY BINARY: error: {{.+}}: Unrecognized instrumentation profile encoding format -BINARY: Perhaps you forgot to use the -sample option? +BINARY: Perhaps you forgot to use the --sample option? 
5- Detect malformed value profile data RUN: not llvm-profdata show %p/Inputs/vp-malform.proftext 2>&1 | FileCheck %s --check-prefix=VP diff --git a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp --- a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// #include "../Target.h" -#include "../Latency.h" #include "AArch64.h" #include "AArch64RegisterInfo.h" diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.cpp b/llvm/tools/llvm-exegesis/lib/Analysis.cpp --- a/llvm/tools/llvm-exegesis/lib/Analysis.cpp +++ b/llvm/tools/llvm-exegesis/lib/Analysis.cpp @@ -244,9 +244,9 @@ return Entries; } -// Uops repeat the same opcode over again. Just show this opcode and show the -// whole snippet only on hover. -static void writeUopsSnippetHtml(raw_ostream &OS, +// Parallel benchmarks repeat the same opcode multiple times. Just show this +// opcode and show the whole snippet only on hover. 
+static void writeParallelSnippetHtml(raw_ostream &OS, const std::vector &Instructions, const MCInstrInfo &InstrInfo) { if (Instructions.empty()) @@ -282,7 +282,7 @@ break; case InstructionBenchmark::Uops: case InstructionBenchmark::InverseThroughput: - writeUopsSnippetHtml(OS, Point.Key.Instructions, *InstrInfo_); + writeParallelSnippetHtml(OS, Point.Key.Instructions, *InstrInfo_); break; default: llvm_unreachable("invalid mode"); diff --git a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt --- a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt +++ b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt @@ -27,18 +27,20 @@ BenchmarkRunner.cpp Clustering.cpp CodeTemplate.cpp - Latency.cpp + LatencyBenchmarkRunner.cpp LlvmState.cpp MCInstrDescView.cpp + ParallelSnippetGenerator.cpp PerfHelper.cpp RegisterAliasing.cpp RegisterValue.cpp SchedClassResolution.cpp + SerialSnippetGenerator.cpp SnippetFile.cpp SnippetGenerator.cpp SnippetRepetitor.cpp Target.cpp - Uops.cpp + UopsBenchmarkRunner.cpp ) llvm_update_compile_flags(LLVMExegesis) diff --git a/llvm/tools/llvm-exegesis/lib/Latency.h b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h rename from llvm/tools/llvm-exegesis/lib/Latency.h rename to llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h --- a/llvm/tools/llvm-exegesis/lib/Latency.h +++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h @@ -1,4 +1,4 @@ -//===-- Latency.h -----------------------------------------------*- C++ -*-===// +//===-- LatencyBenchmarkRunner.h --------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -15,23 +15,10 @@ #define LLVM_TOOLS_LLVM_EXEGESIS_LATENCY_H #include "BenchmarkRunner.h" -#include "Error.h" -#include "MCInstrDescView.h" -#include "SnippetGenerator.h" namespace llvm { namespace exegesis { -class LatencySnippetGenerator : public SnippetGenerator { -public: - using SnippetGenerator::SnippetGenerator; - ~LatencySnippetGenerator() override; - - Expected> - generateCodeTemplates(const Instruction &Instr, - const BitVector &ForbiddenRegisters) const override; -}; - class LatencyBenchmarkRunner : public BenchmarkRunner { public: LatencyBenchmarkRunner(const LLVMState &State, diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp @@ -0,0 +1,58 @@ +//===-- LatencyBenchmarkRunner.cpp ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "LatencyBenchmarkRunner.h" + +#include "Target.h" +#include "BenchmarkRunner.h" + +namespace llvm { +namespace exegesis { + +static constexpr size_t kMaxAliasingInstructions = 10; + +LatencyBenchmarkRunner::LatencyBenchmarkRunner(const LLVMState &State, + InstructionBenchmark::ModeE Mode) + : BenchmarkRunner(State, Mode) { + assert((Mode == InstructionBenchmark::Latency || + Mode == InstructionBenchmark::InverseThroughput) && + "invalid mode"); +} + +LatencyBenchmarkRunner::~LatencyBenchmarkRunner() = default; + +Expected> LatencyBenchmarkRunner::runMeasurements( + const FunctionExecutor &Executor) const { + // Cycle measurements include some overhead from the kernel. Repeat the + // measure several times and take the minimum value. 
+ constexpr const int NumMeasurements = 30; + int64_t MinValue = std::numeric_limits::max(); + const char *CounterName = State.getPfmCounters().CycleCounter; + for (size_t I = 0; I < NumMeasurements; ++I) { + auto ExpectedCounterValue = Executor.runAndMeasure(CounterName); + if (!ExpectedCounterValue) + return ExpectedCounterValue.takeError(); + if (*ExpectedCounterValue < MinValue) + MinValue = *ExpectedCounterValue; + } + std::vector Result; + switch (Mode) { + case InstructionBenchmark::Latency: + Result = {BenchmarkMeasure::Create("latency", MinValue)}; + break; + case InstructionBenchmark::InverseThroughput: + Result = {BenchmarkMeasure::Create("inverse_throughput", MinValue)}; + break; + default: + break; + } + return std::move(Result); +} + +} // namespace exegesis +} // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/Mips/Target.cpp b/llvm/tools/llvm-exegesis/lib/Mips/Target.cpp --- a/llvm/tools/llvm-exegesis/lib/Mips/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Mips/Target.cpp @@ -5,8 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +#include "../Error.h" #include "../Target.h" -#include "../Latency.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "Mips.h" #include "MipsRegisterInfo.h" diff --git a/llvm/tools/llvm-exegesis/lib/Uops.h b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.h rename from llvm/tools/llvm-exegesis/lib/Uops.h rename to llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.h --- a/llvm/tools/llvm-exegesis/lib/Uops.h +++ b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.h @@ -1,4 +1,4 @@ -//===-- Uops.h --------------------------------------------------*- C++ -*-===// +//===-- ParallelSnippetGenerator.h ------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -7,23 +7,22 @@ //===----------------------------------------------------------------------===// /// /// \file -/// A BenchmarkRunner implementation to measure uop decomposition. +/// A SnippetGenerator implementation to create parallel instruction snippets. /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_EXEGESIS_UOPS_H -#define LLVM_TOOLS_LLVM_EXEGESIS_UOPS_H +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_PARALLELSNIPPETGENERATOR_H +#define LLVM_TOOLS_LLVM_EXEGESIS_PARALLELSNIPPETGENERATOR_H -#include "BenchmarkRunner.h" #include "SnippetGenerator.h" namespace llvm { namespace exegesis { -class UopsSnippetGenerator : public SnippetGenerator { +class ParallelSnippetGenerator : public SnippetGenerator { public: using SnippetGenerator::SnippetGenerator; - ~UopsSnippetGenerator() override; + ~ParallelSnippetGenerator() override; Expected> generateCodeTemplates(const Instruction &Instr, @@ -60,20 +59,7 @@ std::vector &SnippetTemplate) const; }; -class UopsBenchmarkRunner : public BenchmarkRunner { -public: - UopsBenchmarkRunner(const LLVMState &State) - : BenchmarkRunner(State, InstructionBenchmark::Uops) {} - ~UopsBenchmarkRunner() override; - - static constexpr const size_t kMinNumDifferentAddresses = 6; - -private: - Expected> - runMeasurements(const FunctionExecutor &Executor) const override; -}; - } // namespace exegesis } // namespace llvm -#endif // LLVM_TOOLS_LLVM_EXEGESIS_UOPS_H +#endif // LLVM_TOOLS_LLVM_EXEGESIS_PARALLELSNIPPETGENERATOR_H diff --git a/llvm/tools/llvm-exegesis/lib/Uops.cpp b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp rename from llvm/tools/llvm-exegesis/lib/Uops.cpp rename to llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp --- a/llvm/tools/llvm-exegesis/lib/Uops.cpp +++ b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp @@ -1,4 +1,4 @@ -//===-- Uops.cpp ------------------------------------------------*- C++ -*-===// +//===-- ParallelSnippetGenerator.cpp 
----------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,9 +6,8 @@ // //===----------------------------------------------------------------------===// -#include "Uops.h" +#include "ParallelSnippetGenerator.h" -#include "Assembler.h" #include "BenchmarkRunner.h" #include "MCInstrDescView.h" #include "Target.h" @@ -16,9 +15,9 @@ // FIXME: Load constants into registers (e.g. with fld1) to not break // instructions like x87. -// Ideally we would like the only limitation on executing uops to be the issue -// ports. Maximizing port pressure increases the likelihood that the load is -// distributed evenly across possible ports. +// Ideally we would like the only limitation on executing instructions to be the +// availability of the CPU resources (e.g. execution ports) needed to execute +// them, instead of the availability of their data dependencies. // To achieve that, one approach is to generate instructions that do not have // data dependencies between them. 
@@ -89,11 +88,9 @@ return Result; } -UopsBenchmarkRunner::~UopsBenchmarkRunner() = default; +ParallelSnippetGenerator::~ParallelSnippetGenerator() = default; -UopsSnippetGenerator::~UopsSnippetGenerator() = default; - -void UopsSnippetGenerator::instantiateMemoryOperands( +void ParallelSnippetGenerator::instantiateMemoryOperands( const unsigned ScratchSpacePointerInReg, std::vector &Instructions) const { if (ScratchSpacePointerInReg == 0) @@ -157,7 +154,7 @@ } } -Expected> UopsSnippetGenerator::generateCodeTemplates( +Expected> ParallelSnippetGenerator::generateCodeTemplates( const Instruction &Instr, const BitVector &ForbiddenRegisters) const { CodeTemplate CT; CT.ScratchSpacePointerInReg = @@ -219,34 +216,7 @@ return getSingleton(std::move(CT)); } -Expected> -UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const { - std::vector Result; - const PfmCountersInfo &PCI = State.getPfmCounters(); - // Uops per port. - for (const auto *IssueCounter = PCI.IssueCounters, - *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters; - IssueCounter != IssueCounterEnd; ++IssueCounter) { - if (!IssueCounter->Counter) - continue; - auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter); - if (!ExpectedCounterValue) - return ExpectedCounterValue.takeError(); - Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName, - *ExpectedCounterValue)); - } - // NumMicroOps. 
- if (const char *const UopsCounter = PCI.UopsCounter) { - auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter); - if (!ExpectedCounterValue) - return ExpectedCounterValue.takeError(); - Result.push_back( - BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue)); - } - return std::move(Result); -} - -constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses; +constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses; } // namespace exegesis } // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/PowerPC/Target.cpp b/llvm/tools/llvm-exegesis/lib/PowerPC/Target.cpp --- a/llvm/tools/llvm-exegesis/lib/PowerPC/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/PowerPC/Target.cpp @@ -7,7 +7,6 @@ // The PowerPC ExegesisTarget. //===----------------------------------------------------------------------===// #include "../Target.h" -#include "../Latency.h" #include "PPC.h" #include "PPCRegisterInfo.h" diff --git a/llvm/tools/llvm-exegesis/lib/Latency.h b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.h rename from llvm/tools/llvm-exegesis/lib/Latency.h rename to llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.h --- a/llvm/tools/llvm-exegesis/lib/Latency.h +++ b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.h @@ -1,4 +1,4 @@ -//===-- Latency.h -----------------------------------------------*- C++ -*-===// +//===-- SerialSnippetGenerator.h --------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,14 +7,13 @@ //===----------------------------------------------------------------------===// /// /// \file -/// A BenchmarkRunner implementation to measure instruction latencies. +/// A SnippetGenerator implementation to create serial instruction snippets. 
/// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_EXEGESIS_LATENCY_H -#define LLVM_TOOLS_LLVM_EXEGESIS_LATENCY_H +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_SERIALSNIPPETGENERATOR_H +#define LLVM_TOOLS_LLVM_EXEGESIS_SERIALSNIPPETGENERATOR_H -#include "BenchmarkRunner.h" #include "Error.h" #include "MCInstrDescView.h" #include "SnippetGenerator.h" @@ -22,27 +21,17 @@ namespace llvm { namespace exegesis { -class LatencySnippetGenerator : public SnippetGenerator { +class SerialSnippetGenerator : public SnippetGenerator { public: using SnippetGenerator::SnippetGenerator; - ~LatencySnippetGenerator() override; + ~SerialSnippetGenerator() override; Expected> generateCodeTemplates(const Instruction &Instr, const BitVector &ForbiddenRegisters) const override; }; -class LatencyBenchmarkRunner : public BenchmarkRunner { -public: - LatencyBenchmarkRunner(const LLVMState &State, - InstructionBenchmark::ModeE Mode); - ~LatencyBenchmarkRunner() override; - -private: - Expected> - runMeasurements(const FunctionExecutor &Executor) const override; -}; } // namespace exegesis } // namespace llvm -#endif // LLVM_TOOLS_LLVM_EXEGESIS_LATENCY_H +#endif // LLVM_TOOLS_LLVM_EXEGESIS_SERIALSNIPPETGENERATOR_H diff --git a/llvm/tools/llvm-exegesis/lib/Latency.cpp b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp rename from llvm/tools/llvm-exegesis/lib/Latency.cpp rename to llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp --- a/llvm/tools/llvm-exegesis/lib/Latency.cpp +++ b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp @@ -1,4 +1,4 @@ -//===-- Latency.cpp ---------------------------------------------*- C++ -*-===// +//===-- SerialSnippetGenerator.cpp ------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,17 +6,13 @@ // //===----------------------------------------------------------------------===// -#include "Latency.h" +#include "SerialSnippetGenerator.h" -#include "Assembler.h" -#include "BenchmarkRunner.h" #include "MCInstrDescView.h" -#include "PerfHelper.h" -#include "Target.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstBuilder.h" -#include "llvm/Support/FormatVariadic.h" +#include "CodeTemplate.h" +#include +#include +#include namespace llvm { namespace exegesis { @@ -149,10 +145,10 @@ } } -LatencySnippetGenerator::~LatencySnippetGenerator() = default; +SerialSnippetGenerator::~SerialSnippetGenerator() = default; Expected> -LatencySnippetGenerator::generateCodeTemplates( +SerialSnippetGenerator::generateCodeTemplates( const Instruction &Instr, const BitVector &ForbiddenRegisters) const { std::vector Results; const ExecutionMode EM = getExecutionModes(Instr, ForbiddenRegisters); @@ -169,43 +165,5 @@ return std::move(Results); } -LatencyBenchmarkRunner::LatencyBenchmarkRunner(const LLVMState &State, - InstructionBenchmark::ModeE Mode) - : BenchmarkRunner(State, Mode) { - assert((Mode == InstructionBenchmark::Latency || - Mode == InstructionBenchmark::InverseThroughput) && - "invalid mode"); -} - -LatencyBenchmarkRunner::~LatencyBenchmarkRunner() = default; - -Expected> LatencyBenchmarkRunner::runMeasurements( - const FunctionExecutor &Executor) const { - // Cycle measurements include some overhead from the kernel. Repeat the - // measure several times and take the minimum value. 
- constexpr const int NumMeasurements = 30; - int64_t MinValue = std::numeric_limits::max(); - const char *CounterName = State.getPfmCounters().CycleCounter; - for (size_t I = 0; I < NumMeasurements; ++I) { - auto ExpectedCounterValue = Executor.runAndMeasure(CounterName); - if (!ExpectedCounterValue) - return ExpectedCounterValue.takeError(); - if (*ExpectedCounterValue < MinValue) - MinValue = *ExpectedCounterValue; - } - std::vector Result; - switch (Mode) { - case InstructionBenchmark::Latency: - Result = {BenchmarkMeasure::Create("latency", MinValue)}; - break; - case InstructionBenchmark::InverseThroughput: - Result = {BenchmarkMeasure::Create("inverse_throughput", MinValue)}; - break; - default: - break; - } - return std::move(Result); -} - } // namespace exegesis } // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h --- a/llvm/tools/llvm-exegesis/lib/Target.h +++ b/llvm/tools/llvm-exegesis/lib/Target.h @@ -144,9 +144,9 @@ // Targets can implement their own snippet generators/benchmarks runners by // implementing these. 
- std::unique_ptr virtual createLatencySnippetGenerator( + std::unique_ptr virtual createSerialSnippetGenerator( const LLVMState &State, const SnippetGenerator::Options &Opts) const; - std::unique_ptr virtual createUopsSnippetGenerator( + std::unique_ptr virtual createParallelSnippetGenerator( const LLVMState &State, const SnippetGenerator::Options &Opts) const; std::unique_ptr virtual createLatencyBenchmarkRunner( const LLVMState &State, InstructionBenchmark::ModeE Mode) const; diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp --- a/llvm/tools/llvm-exegesis/lib/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Target.cpp @@ -7,8 +7,10 @@ //===----------------------------------------------------------------------===// #include "Target.h" -#include "Latency.h" -#include "Uops.h" +#include "LatencyBenchmarkRunner.h" +#include "ParallelSnippetGenerator.h" +#include "SerialSnippetGenerator.h" +#include "UopsBenchmarkRunner.h" namespace llvm { namespace exegesis { @@ -43,10 +45,10 @@ case InstructionBenchmark::Unknown: return nullptr; case InstructionBenchmark::Latency: - return createLatencySnippetGenerator(State, Opts); + return createSerialSnippetGenerator(State, Opts); case InstructionBenchmark::Uops: case InstructionBenchmark::InverseThroughput: - return createUopsSnippetGenerator(State, Opts); + return createParallelSnippetGenerator(State, Opts); } return nullptr; } @@ -77,14 +79,14 @@ return nullptr; } -std::unique_ptr ExegesisTarget::createLatencySnippetGenerator( +std::unique_ptr ExegesisTarget::createSerialSnippetGenerator( const LLVMState &State, const SnippetGenerator::Options &Opts) const { - return std::make_unique(State, Opts); + return std::make_unique(State, Opts); } -std::unique_ptr ExegesisTarget::createUopsSnippetGenerator( +std::unique_ptr ExegesisTarget::createParallelSnippetGenerator( const LLVMState &State, const SnippetGenerator::Options &Opts) const { - return std::make_unique(State, Opts); + return 
std::make_unique(State, Opts); } std::unique_ptr ExegesisTarget::createLatencyBenchmarkRunner( diff --git a/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.h new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.h @@ -0,0 +1,38 @@ +//===-- UopsBenchmarkRunner.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// A BenchmarkRunner implementation to measure uop decomposition. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_UOPSBENCHMARKRUNNER_H +#define LLVM_TOOLS_LLVM_EXEGESIS_UOPSBENCHMARKRUNNER_H + +#include "BenchmarkRunner.h" + +namespace llvm { +namespace exegesis { + +class UopsBenchmarkRunner : public BenchmarkRunner { +public: + UopsBenchmarkRunner(const LLVMState &State) + : BenchmarkRunner(State, InstructionBenchmark::Uops) {} + ~UopsBenchmarkRunner() override; + + static constexpr const size_t kMinNumDifferentAddresses = 6; + +private: + Expected> + runMeasurements(const FunctionExecutor &Executor) const override; +}; + +} // namespace exegesis +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_UOPSBENCHMARKRUNNER_H diff --git a/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.cpp @@ -0,0 +1,46 @@ +//===-- UopsBenchmarkRunner.cpp ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "UopsBenchmarkRunner.h" + +#include "Target.h" + +namespace llvm { +namespace exegesis { + +UopsBenchmarkRunner::~UopsBenchmarkRunner() = default; + +Expected> +UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const { + std::vector Result; + const PfmCountersInfo &PCI = State.getPfmCounters(); + // Uops per port. + for (const auto *IssueCounter = PCI.IssueCounters, + *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters; + IssueCounter != IssueCounterEnd; ++IssueCounter) { + if (!IssueCounter->Counter) + continue; + auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter); + if (!ExpectedCounterValue) + return ExpectedCounterValue.takeError(); + Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName, + *ExpectedCounterValue)); + } + // NumMicroOps. + if (const char *const UopsCounter = PCI.UopsCounter) { + auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter); + if (!ExpectedCounterValue) + return ExpectedCounterValue.takeError(); + Result.push_back( + BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue)); + } + return std::move(Result); +} + +} // namespace exegesis +} // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -8,9 +8,9 @@ #include "../Target.h" #include "../Error.h" -#include "../Latency.h" +#include "../SerialSnippetGenerator.h" #include "../SnippetGenerator.h" -#include "../Uops.h" +#include "../ParallelSnippetGenerator.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "X86.h" @@ -242,9 +242,9 @@ } namespace { -class X86LatencySnippetGenerator : public LatencySnippetGenerator { +class X86SerialSnippetGenerator 
: public SerialSnippetGenerator { public: - using LatencySnippetGenerator::LatencySnippetGenerator; + using SerialSnippetGenerator::SerialSnippetGenerator; Expected> generateCodeTemplates(const Instruction &Instr, @@ -253,7 +253,7 @@ } // namespace Expected> -X86LatencySnippetGenerator::generateCodeTemplates( +X86SerialSnippetGenerator::generateCodeTemplates( const Instruction &Instr, const BitVector &ForbiddenRegisters) const { if (auto E = IsInvalidOpcode(Instr)) return std::move(E); @@ -271,7 +271,7 @@ switch (getX86FPFlags(Instr)) { case X86II::NotFP: - return LatencySnippetGenerator::generateCodeTemplates(Instr, + return SerialSnippetGenerator::generateCodeTemplates(Instr, ForbiddenRegisters); case X86II::ZeroArgFP: case X86II::OneArgFP: @@ -292,9 +292,9 @@ } namespace { -class X86UopsSnippetGenerator : public UopsSnippetGenerator { +class X86ParallelSnippetGenerator : public ParallelSnippetGenerator { public: - using UopsSnippetGenerator::UopsSnippetGenerator; + using ParallelSnippetGenerator::ParallelSnippetGenerator; Expected> generateCodeTemplates(const Instruction &Instr, @@ -304,7 +304,7 @@ } // namespace Expected> -X86UopsSnippetGenerator::generateCodeTemplates( +X86ParallelSnippetGenerator::generateCodeTemplates( const Instruction &Instr, const BitVector &ForbiddenRegisters) const { if (auto E = IsInvalidOpcode(Instr)) return std::move(E); @@ -333,7 +333,7 @@ switch (getX86FPFlags(Instr)) { case X86II::NotFP: - return UopsSnippetGenerator::generateCodeTemplates(Instr, + return ParallelSnippetGenerator::generateCodeTemplates(Instr, ForbiddenRegisters); case X86II::ZeroArgFP: case X86II::OneArgFP: @@ -577,16 +577,16 @@ sizeof(kUnavailableRegisters[0])); } - std::unique_ptr createLatencySnippetGenerator( + std::unique_ptr createSerialSnippetGenerator( const LLVMState &State, const SnippetGenerator::Options &Opts) const override { - return std::make_unique(State, Opts); + return std::make_unique(State, Opts); } - std::unique_ptr createUopsSnippetGenerator( 
+ std::unique_ptr createParallelSnippetGenerator( const LLVMState &State, const SnippetGenerator::Options &Opts) const override { - return std::make_unique(State, Opts); + return std::make_unique(State, Opts); } bool matchesArch(Triple::ArchType Arch) const override { diff --git a/llvm/tools/llvm-ml/CMakeLists.txt b/llvm/tools/llvm-ml/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-ml/CMakeLists.txt @@ -0,0 +1,14 @@ +set(LLVM_LINK_COMPONENTS + AllTargetsAsmParsers + AllTargetsDescs + AllTargetsDisassemblers + AllTargetsInfos + MC + MCParser + Support + ) + +add_llvm_tool(llvm-ml + llvm-ml.cpp + Disassembler.cpp + ) diff --git a/llvm/tools/llvm-ml/Disassembler.h b/llvm/tools/llvm-ml/Disassembler.h new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-ml/Disassembler.h @@ -0,0 +1,37 @@ +//===- Disassembler.h - Text File Disassembler ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class implements the disassembler of strings of bytes written in +// hexadecimal, from standard input or from a file. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MC_DISASSEMBLER_H +#define LLVM_TOOLS_LLVM_MC_DISASSEMBLER_H + +#include + +namespace llvm { + +class MemoryBuffer; +class Target; +class raw_ostream; +class SourceMgr; +class MCSubtargetInfo; +class MCStreamer; + +class Disassembler { +public: + static int disassemble(const Target &T, const std::string &Triple, + MCSubtargetInfo &STI, MCStreamer &Streamer, + MemoryBuffer &Buffer, SourceMgr &SM, raw_ostream &Out); +}; + +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-ml/Disassembler.cpp b/llvm/tools/llvm-ml/Disassembler.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-ml/Disassembler.cpp @@ -0,0 +1,203 @@ +//===- Disassembler.cpp - Disassembler for hex strings --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class implements the disassembler of strings of bytes written in +// hexadecimal, from standard input or from a file. 
+// +//===----------------------------------------------------------------------===// + +#include "Disassembler.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +typedef std::pair, std::vector> + ByteArrayTy; + +static bool PrintInsts(const MCDisassembler &DisAsm, const ByteArrayTy &Bytes, + SourceMgr &SM, raw_ostream &Out, MCStreamer &Streamer, + bool InAtomicBlock, const MCSubtargetInfo &STI) { + ArrayRef Data(Bytes.first.data(), Bytes.first.size()); + + // Disassemble it to strings. + uint64_t Size; + uint64_t Index; + + for (Index = 0; Index < Bytes.first.size(); Index += Size) { + MCInst Inst; + + MCDisassembler::DecodeStatus S; + S = DisAsm.getInstruction(Inst, Size, Data.slice(Index), Index, nulls()); + switch (S) { + case MCDisassembler::Fail: + SM.PrintMessage(SMLoc::getFromPointer(Bytes.second[Index]), + SourceMgr::DK_Warning, "invalid instruction encoding"); + // Don't try to resynchronise the stream in a block + if (InAtomicBlock) + return true; + + if (Size == 0) + Size = 1; // skip illegible bytes + + break; + + case MCDisassembler::SoftFail: + SM.PrintMessage(SMLoc::getFromPointer(Bytes.second[Index]), + SourceMgr::DK_Warning, + "potentially undefined instruction encoding"); + LLVM_FALLTHROUGH; + + case MCDisassembler::Success: + Streamer.EmitInstruction(Inst, STI); + break; + } + } + + return false; +} + +static bool SkipToToken(StringRef &Str) { + for (;;) { + if (Str.empty()) + return false; + + // Strip horizontal whitespace and commas. 
+ if (size_t Pos = Str.find_first_not_of(" \t\r\n,")) { + Str = Str.substr(Pos); + continue; + } + + // If this is the start of a comment, remove the rest of the line. + if (Str[0] == '#') { + Str = Str.substr(Str.find_first_of('\n')); + continue; + } + return true; + } +} + +static bool ByteArrayFromString(ByteArrayTy &ByteArray, StringRef &Str, + SourceMgr &SM) { + while (SkipToToken(Str)) { + // Handled by higher level + if (Str[0] == '[' || Str[0] == ']') + return false; + + // Get the current token. + size_t Next = Str.find_first_of(" \t\n\r,#[]"); + StringRef Value = Str.substr(0, Next); + + // Convert to a byte and add to the byte vector. + unsigned ByteVal; + if (Value.getAsInteger(0, ByteVal) || ByteVal > 255) { + // If we have an error, print it and skip to the end of line. + SM.PrintMessage(SMLoc::getFromPointer(Value.data()), SourceMgr::DK_Error, + "invalid input token"); + Str = Str.substr(Str.find('\n')); + ByteArray.first.clear(); + ByteArray.second.clear(); + continue; + } + + ByteArray.first.push_back(ByteVal); + ByteArray.second.push_back(Value.data()); + Str = Str.substr(Next); + } + + return false; +} + +int Disassembler::disassemble(const Target &T, const std::string &Triple, + MCSubtargetInfo &STI, MCStreamer &Streamer, + MemoryBuffer &Buffer, SourceMgr &SM, + raw_ostream &Out) { + std::unique_ptr MRI(T.createMCRegInfo(Triple)); + if (!MRI) { + errs() << "error: no register info for target " << Triple << "\n"; + return -1; + } + + MCTargetOptions MCOptions; + std::unique_ptr MAI( + T.createMCAsmInfo(*MRI, Triple, MCOptions)); + if (!MAI) { + errs() << "error: no assembly info for target " << Triple << "\n"; + return -1; + } + + // Set up the MCContext for creating symbols and MCExpr's. 
+ MCContext Ctx(MAI.get(), MRI.get(), nullptr); + + std::unique_ptr DisAsm( + T.createMCDisassembler(STI, Ctx)); + if (!DisAsm) { + errs() << "error: no disassembler for target " << Triple << "\n"; + return -1; + } + + // Set up initial section manually here + Streamer.InitSections(false); + + bool ErrorOccurred = false; + + // Convert the input to a vector for disassembly. + ByteArrayTy ByteArray; + StringRef Str = Buffer.getBuffer(); + bool InAtomicBlock = false; + + while (SkipToToken(Str)) { + ByteArray.first.clear(); + ByteArray.second.clear(); + + if (Str[0] == '[') { + if (InAtomicBlock) { + SM.PrintMessage(SMLoc::getFromPointer(Str.data()), SourceMgr::DK_Error, + "nested atomic blocks make no sense"); + ErrorOccurred = true; + } + InAtomicBlock = true; + Str = Str.drop_front(); + continue; + } else if (Str[0] == ']') { + if (!InAtomicBlock) { + SM.PrintMessage(SMLoc::getFromPointer(Str.data()), SourceMgr::DK_Error, + "attempt to close atomic block without opening"); + ErrorOccurred = true; + } + InAtomicBlock = false; + Str = Str.drop_front(); + continue; + } + + // It's a real token, get the bytes and emit them + ErrorOccurred |= ByteArrayFromString(ByteArray, Str, SM); + + if (!ByteArray.first.empty()) + ErrorOccurred |= + PrintInsts(*DisAsm, ByteArray, SM, Out, Streamer, InAtomicBlock, STI); + } + + if (InAtomicBlock) { + SM.PrintMessage(SMLoc::getFromPointer(Str.data()), SourceMgr::DK_Error, + "unclosed atomic block"); + ErrorOccurred = true; + } + + return ErrorOccurred; +} diff --git a/llvm/tools/llvm-ml/llvm-ml.cpp b/llvm/tools/llvm-ml/llvm-ml.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-ml/llvm-ml.cpp @@ -0,0 +1,381 @@ +//===-- llvm-ml.cpp - masm-compatible assembler -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A simple driver around MasmParser; based on llvm-mc. +// +//===----------------------------------------------------------------------===// + +#include "Disassembler.h" + +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCParser/AsmLexer.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptionsCommandFlags.inc" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compression.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/WithColor.h" + +using namespace llvm; + +static cl::opt +InputFilename(cl::Positional, cl::desc(""), cl::init("-")); + +static cl::opt +OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename"), + cl::init("-")); + +static cl::opt +ShowEncoding("show-encoding", cl::desc("Show instruction encodings")); + +static cl::opt +ShowInst("show-inst", cl::desc("Show internal instruction representation")); + +static cl::opt +ShowInstOperands("show-inst-operands", + cl::desc("Show instructions operands as parsed")); + +static cl::opt +OutputATTAsm("output-att-asm", cl::desc("Use ATT syntax for output printing")); + +static cl::opt +PrintImmHex("print-imm-hex", 
cl::init(false), + cl::desc("Prefer hex format for immediate values")); + +static cl::opt +PreserveComments("preserve-comments", + cl::desc("Preserve Comments in outputted assembly")); + +enum OutputFileType { + OFT_Null, + OFT_AssemblyFile, + OFT_ObjectFile +}; +static cl::opt +FileType("filetype", cl::init(OFT_ObjectFile), + cl::desc("Choose an output file type:"), + cl::values( + clEnumValN(OFT_AssemblyFile, "asm", + "Emit an assembly ('.s') file"), + clEnumValN(OFT_Null, "null", + "Don't emit anything (for timing purposes)"), + clEnumValN(OFT_ObjectFile, "obj", + "Emit a native object ('.o') file"))); + +static cl::list +IncludeDirs("I", cl::desc("Directory of include files"), + cl::value_desc("directory"), cl::Prefix); + +enum BitnessType { + m32, + m64, +}; +cl::opt Bitness(cl::desc("Choose bitness:"), cl::init(m64), + cl::values(clEnumVal(m32, "32-bit"), + clEnumVal(m64, "64-bit (default)"))); + +static cl::opt +TripleName("triple", cl::desc("Target triple to assemble for, " + "see -version for available targets")); + +static cl::opt +DebugCompilationDir("fdebug-compilation-dir", + cl::desc("Specifies the debug info's compilation dir")); + +static cl::list +DebugPrefixMap("fdebug-prefix-map", + cl::desc("Map file source paths in debug info"), + cl::value_desc("= separated key-value pairs")); + +static cl::opt +MainFileName("main-file-name", + cl::desc("Specifies the name we should consider the input file")); + +static cl::opt SaveTempLabels("save-temp-labels", + cl::desc("Don't discard temporary labels")); + +enum ActionType { + AC_AsLex, + AC_Assemble, + AC_Disassemble, + AC_MDisassemble, +}; + +static cl::opt +Action(cl::desc("Action to perform:"), + cl::init(AC_Assemble), + cl::values(clEnumValN(AC_AsLex, "as-lex", + "Lex tokens from a .asm file"), + clEnumValN(AC_Assemble, "assemble", + "Assemble a .asm file (default)"), + clEnumValN(AC_Disassemble, "disassemble", + "Disassemble strings of hex bytes"), + clEnumValN(AC_MDisassemble, "mdis", + "Marked up 
disassembly of strings of hex bytes"))); + +static const Target *GetTarget(const char *ProgName) { + // Figure out the target triple. + if (TripleName.empty()) { + if (Bitness == m32) + TripleName = "i386-pc-windows"; + else if (Bitness == m64) + TripleName = "x86_64-pc-windows"; + } + Triple TheTriple(Triple::normalize(TripleName)); + + // Get the target specific parser. + std::string Error; + const Target *TheTarget = TargetRegistry::lookupTarget("", TheTriple, Error); + if (!TheTarget) { + WithColor::error(errs(), ProgName) << Error; + return nullptr; + } + + // Update the triple name and return the found target. + TripleName = TheTriple.getTriple(); + return TheTarget; +} + +static std::unique_ptr GetOutputStream(StringRef Path) { + std::error_code EC; + auto Out = std::make_unique(Path, EC, sys::fs::F_None); + if (EC) { + WithColor::error() << EC.message() << '\n'; + return nullptr; + } + + return Out; +} + +static int AsLexInput(SourceMgr &SrcMgr, MCAsmInfo &MAI, raw_ostream &OS) { + AsmLexer Lexer(MAI); + Lexer.setBuffer(SrcMgr.getMemoryBuffer(SrcMgr.getMainFileID())->getBuffer()); + + bool Error = false; + while (Lexer.Lex().isNot(AsmToken::Eof)) { + Lexer.getTok().dump(OS); + OS << "\n"; + if (Lexer.getTok().getKind() == AsmToken::Error) + Error = true; + } + + return Error; +} + +static int AssembleInput(const char *ProgName, const Target *TheTarget, + SourceMgr &SrcMgr, MCContext &Ctx, MCStreamer &Str, + MCAsmInfo &MAI, MCSubtargetInfo &STI, + MCInstrInfo &MCII, MCTargetOptions &MCOptions) { + std::unique_ptr Parser(createMCAsmParser(SrcMgr, Ctx, Str, MAI)); + std::unique_ptr TAP( + TheTarget->createMCAsmParser(STI, *Parser, MCII, MCOptions)); + + if (!TAP) { + WithColor::error(errs(), ProgName) + << "this target does not support assembly parsing.\n"; + return 1; + } + + Parser->setShowParsedOperands(ShowInstOperands); + Parser->setTargetParser(*TAP); + Parser->getLexer().setLexMasmIntegers(true); + + int Res = Parser->Run(/*NoInitialTextSection=*/true); 
+ + return Res; +} + +int main(int argc, char **argv) { + InitLLVM X(argc, argv); + + // Initialize targets and assembly printers/parsers. + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllDisassemblers(); + + // Register the target printer for --version. + cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); + + cl::ParseCommandLineOptions(argc, argv, "llvm machine code playground\n"); + MCTargetOptions MCOptions = InitMCTargetOptionsFromFlags(); + + const char *ProgName = argv[0]; + const Target *TheTarget = GetTarget(ProgName); + if (!TheTarget) + return 1; + // Now that GetTarget() has (potentially) replaced TripleName, it's safe to + // construct the Triple object. + Triple TheTriple(TripleName); + + ErrorOr> BufferPtr = + MemoryBuffer::getFileOrSTDIN(InputFilename); + if (std::error_code EC = BufferPtr.getError()) { + WithColor::error(errs(), ProgName) + << InputFilename << ": " << EC.message() << '\n'; + return 1; + } + MemoryBuffer *Buffer = BufferPtr->get(); + + SourceMgr SrcMgr; + + // Tell SrcMgr about this buffer, which is what the parser will pick up. + SrcMgr.AddNewSourceBuffer(std::move(*BufferPtr), SMLoc()); + + // Record the location of the include directories so that the lexer can find + // it later. + SrcMgr.setIncludeDirs(IncludeDirs); + + std::unique_ptr MRI(TheTarget->createMCRegInfo(TripleName)); + assert(MRI && "Unable to create target register info!"); + + std::unique_ptr MAI( + TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + assert(MAI && "Unable to create target asm info!"); + + MAI->setPreserveAsmComments(PreserveComments); + + // FIXME: This is not pretty. MCContext has a ptr to MCObjectFileInfo and + // MCObjectFileInfo needs a MCContext reference in order to initialize itself. 
+ MCObjectFileInfo MOFI; + MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr); + MOFI.InitMCObjectFileInfo(TheTriple, /*PIC=*/false, Ctx, + /*LargeCodeModel=*/true); + + if (SaveTempLabels) + Ctx.setAllowTemporaryLabels(false); + + if (!DebugCompilationDir.empty()) { + Ctx.setCompilationDir(DebugCompilationDir); + } else { + // If no compilation dir is set, try to use the current directory. + SmallString<128> CWD; + if (!sys::fs::current_path(CWD)) + Ctx.setCompilationDir(CWD); + } + for (const auto &Arg : DebugPrefixMap) { + const auto &KV = StringRef(Arg).split('='); + Ctx.addDebugPrefixMapEntry(KV.first, KV.second); + } + if (!MainFileName.empty()) + Ctx.setMainFileName(MainFileName); + + std::unique_ptr Out = GetOutputStream(OutputFilename); + if (!Out) + return 1; + + std::unique_ptr BOS; + raw_pwrite_stream *OS = &Out->os(); + std::unique_ptr Str; + + std::unique_ptr MCII(TheTarget->createMCInstrInfo()); + std::unique_ptr STI(TheTarget->createMCSubtargetInfo( + TripleName, /*CPU=*/"", /*Features=*/"")); + + MCInstPrinter *IP = nullptr; + if (FileType == OFT_AssemblyFile) { + const unsigned OutputAsmVariant = OutputATTAsm ? 0U // ATT dialect + : 1U; // Intel dialect + IP = TheTarget->createMCInstPrinter(Triple(TripleName), OutputAsmVariant, + *MAI, *MCII, *MRI); + + if (!IP) { + WithColor::error() + << "unable to create instruction printer for target triple '" + << TheTriple.normalize() << "' with " + << (OutputATTAsm ? "ATT" : "Intel") << " assembly variant.\n"; + return 1; + } + + // Set the display preference for hex vs. decimal immediates. + IP->setPrintImmHex(PrintImmHex); + + // Set up the AsmStreamer. 
+ std::unique_ptr CE; + if (ShowEncoding) + CE.reset(TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx)); + + std::unique_ptr MAB( + TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions)); + auto FOut = std::make_unique(*OS); + Str.reset( + TheTarget->createAsmStreamer(Ctx, std::move(FOut), /*asmverbose*/ true, + /*useDwarfDirectory*/ true, IP, + std::move(CE), std::move(MAB), ShowInst)); + + } else if (FileType == OFT_Null) { + Str.reset(TheTarget->createNullStreamer(Ctx)); + } else { + assert(FileType == OFT_ObjectFile && "Invalid file type!"); + + // Don't waste memory on names of temp labels. + Ctx.setUseNamesOnTempLabels(false); + + if (!Out->os().supportsSeeking()) { + BOS = std::make_unique(Out->os()); + OS = BOS.get(); + } + + MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx); + MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions); + Str.reset(TheTarget->createMCObjectStreamer( + TheTriple, Ctx, std::unique_ptr(MAB), + MAB->createObjectWriter(*OS), std::unique_ptr(CE), *STI, + MCOptions.MCRelaxAll, MCOptions.MCIncrementalLinkerCompatible, + /*DWARFMustBeAtTheEnd*/ false)); + } + + // Use Assembler information for parsing. + Str->setUseAssemblerInfoForParsing(true); + + int Res = 1; + bool disassemble = false; + switch (Action) { + case AC_AsLex: + Res = AsLexInput(SrcMgr, *MAI, Out->os()); + break; + case AC_Assemble: + Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, *Str, *MAI, *STI, + *MCII, MCOptions); + break; + case AC_MDisassemble: + assert(IP && "Expected assembly output"); + IP->setUseMarkup(1); + disassemble = true; + break; + case AC_Disassemble: + disassemble = true; + break; + } + if (disassemble) + Res = Disassembler::disassemble(*TheTarget, TripleName, *STI, *Str, *Buffer, + SrcMgr, Out->os()); + + // Keep output if no errors. 
+ if (Res == 0) + Out->keep(); + return Res; +} diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -70,8 +70,8 @@ instrprof_error instrError = IPE.get(); StringRef Hint = ""; if (instrError == instrprof_error::unrecognized_format) { - // Hint for common error of forgetting -sample for sample profiles. - Hint = "Perhaps you forgot to use the -sample option?"; + // Hint for common error of forgetting --sample for sample profiles. + Hint = "Perhaps you forgot to use the --sample option?"; } exitWithError(IPE.message(), Whence, Hint); }); diff --git a/llvm/unittests/Support/FileCheckTest.cpp b/llvm/unittests/Support/FileCheckTest.cpp --- a/llvm/unittests/Support/FileCheckTest.cpp +++ b/llvm/unittests/Support/FileCheckTest.cpp @@ -215,7 +215,7 @@ SourceMgr SM; FileCheckRequest Req; FileCheckPatternContext Context; - Pattern P{Check::CheckPlain, &Context, LineNumber++}; + Pattern P{Check::CheckPlain, &Context, LineNumber}; public: PatternTester() { @@ -236,16 +236,18 @@ } void initNextPattern() { - P = Pattern(Check::CheckPlain, &Context, LineNumber++); + P = Pattern(Check::CheckPlain, &Context, ++LineNumber); } - bool parseSubstExpect(StringRef Expr) { + size_t getLineNumber() const { return LineNumber; } + + bool parseSubstExpect(StringRef Expr, bool IsLegacyLineExpr = false) { StringRef ExprBufferRef = bufferize(SM, Expr); Optional DefinedNumericVariable; - return errorToBool( - P.parseNumericSubstitutionBlock(ExprBufferRef, DefinedNumericVariable, - false, LineNumber - 1, &Context, SM) - .takeError()); + return errorToBool(P.parseNumericSubstitutionBlock( + ExprBufferRef, DefinedNumericVariable, + IsLegacyLineExpr, LineNumber, &Context, SM) + .takeError()); } bool parsePatternExpect(StringRef Pattern) { @@ -260,14 +262,15 @@ } }; -TEST_F(FileCheckTest, ParseExpr) { +TEST_F(FileCheckTest, ParseNumericSubstitutionBlock) 
{ PatternTester Tester; // Variable definition. - // Definition of invalid variable. - EXPECT_TRUE(Tester.parseSubstExpect("10VAR:")); - EXPECT_TRUE(Tester.parseSubstExpect("@FOO:")); + // Invalid variable name. + EXPECT_TRUE(Tester.parseSubstExpect("%VAR:")); + + // Invalid definition of pseudo variable. EXPECT_TRUE(Tester.parseSubstExpect("@LINE:")); // Conflict with pattern variable. @@ -281,82 +284,100 @@ EXPECT_FALSE(Tester.parseSubstExpect(" VAR2:")); EXPECT_FALSE(Tester.parseSubstExpect("VAR3 :")); EXPECT_FALSE(Tester.parseSubstExpect("VAR3: ")); - EXPECT_FALSE(Tester.parsePatternExpect("[[#FOOBAR: FOO+1]]")); + EXPECT_FALSE(Tester.parseSubstExpect("FOOBAR: FOO+1")); // Numeric expression. - // Unacceptable variable. - EXPECT_TRUE(Tester.parseSubstExpect("10VAR")); + // Invalid variable name. + EXPECT_TRUE(Tester.parseSubstExpect("%VAR")); + + // Invalid pseudo variable. EXPECT_TRUE(Tester.parseSubstExpect("@FOO")); - // Only valid variable. - EXPECT_FALSE(Tester.parseSubstExpect("@LINE")); - EXPECT_FALSE(Tester.parseSubstExpect("FOO")); - EXPECT_FALSE(Tester.parseSubstExpect("UNDEF")); + // Invalid use of variable defined on the same line. Use parsePatternExpect + // for the variable to be recorded in GlobalNumericVariableTable and thus + // appear defined to parseNumericVariableUse. Note that the same pattern + // object is used for the parsePatternExpect and parseSubstExpect since no + // initNextPattern is called, thus appearing as being on the same line from + // the pattern's point of view. + ASSERT_FALSE(Tester.parsePatternExpect("[[#SAME_LINE_VAR:]]")); + EXPECT_TRUE(Tester.parseSubstExpect("SAME_LINE_VAR")); + + // Invalid use of variable defined on the same line from an expression not + // using any variable defined on the same line. 
+ ASSERT_FALSE(Tester.parsePatternExpect("[[#SAME_LINE_EXPR_VAR:@LINE+1]]")); + EXPECT_TRUE(Tester.parseSubstExpect("SAME_LINE_EXPR_VAR")); + + // Valid use of undefined variable which creates the variable and records it + // in GlobalNumericVariableTable. + ASSERT_FALSE(Tester.parseSubstExpect("UNDEF")); + EXPECT_TRUE(Tester.parsePatternExpect("[[UNDEF:.*]]")); + + // Invalid literal. + EXPECT_TRUE(Tester.parseSubstExpect("42U")); // Valid empty expression. EXPECT_FALSE(Tester.parseSubstExpect("")); - // Invalid use of variable defined on the same line from expression. Note - // that the same pattern object is used for the parsePatternExpect and - // parseSubstExpect since no initNextPattern is called, thus appearing as - // being on the same line from the pattern's point of view. - ASSERT_FALSE(Tester.parsePatternExpect("[[#LINE1VAR:FOO+1]]")); - EXPECT_TRUE(Tester.parseSubstExpect("LINE1VAR")); + // Valid single operand expression. + EXPECT_FALSE(Tester.parseSubstExpect("FOO")); - // Invalid use of variable defined on same line from input. As above, the - // absence of a call to initNextPattern makes it appear to be on the same - // line from the pattern's point of view. - ASSERT_FALSE(Tester.parsePatternExpect("[[#LINE2VAR:]]")); - EXPECT_TRUE(Tester.parseSubstExpect("LINE2VAR")); + // Valid expression with 2 or more operands. + EXPECT_FALSE(Tester.parseSubstExpect("FOO+3")); + EXPECT_FALSE(Tester.parseSubstExpect("FOO-3+FOO")); // Unsupported operator. EXPECT_TRUE(Tester.parseSubstExpect("@LINE/2")); - // Missing offset operand. + // Missing RHS operand. EXPECT_TRUE(Tester.parseSubstExpect("@LINE+")); - // Valid expression. 
- EXPECT_FALSE(Tester.parseSubstExpect("@LINE+5")); - EXPECT_FALSE(Tester.parseSubstExpect("FOO+4")); - Tester.initNextPattern(); - EXPECT_FALSE(Tester.parseSubstExpect("FOOBAR")); - EXPECT_FALSE(Tester.parseSubstExpect("LINE1VAR")); - EXPECT_FALSE(Tester.parsePatternExpect("[[#FOO+FOO]]")); - EXPECT_FALSE(Tester.parsePatternExpect("[[#FOO+3-FOO]]")); + // Errors in RHS operand are bubbled up by parseBinop() to + // parseNumericSubstitutionBlock. + EXPECT_TRUE(Tester.parseSubstExpect("@LINE+%VAR")); + + // Invalid legacy @LINE expression with non literal rhs. + EXPECT_TRUE(Tester.parseSubstExpect("@LINE+@LINE", /*IsLegacyLineExpr=*/true)); + + // Invalid legacy @LINE expression made of a single literal. + EXPECT_TRUE(Tester.parseSubstExpect("2", /*IsLegacyLineExpr=*/true)); + + // Valid legacy @LINE expression. + EXPECT_FALSE(Tester.parseSubstExpect("@LINE+2", /*IsLegacyLineExpr=*/true)); + + // Invalid legacy @LINE expression with more than 2 operands. + EXPECT_TRUE( + Tester.parseSubstExpect("@LINE+2+@LINE", /*IsLegacyLineExpr=*/true)); + EXPECT_TRUE(Tester.parseSubstExpect("@LINE+2+2", /*IsLegacyLineExpr=*/true)); } TEST_F(FileCheckTest, ParsePattern) { PatternTester Tester; - // Space in pattern variable expression. + // Invalid space in string substitution. EXPECT_TRUE(Tester.parsePatternExpect("[[ BAR]]")); - // Invalid variable name. + // Invalid variable name in string substitution. EXPECT_TRUE(Tester.parsePatternExpect("[[42INVALID]]")); - // Invalid pattern variable definition. + // Invalid string variable definition. EXPECT_TRUE(Tester.parsePatternExpect("[[@PAT:]]")); EXPECT_TRUE(Tester.parsePatternExpect("[[PAT+2:]]")); // Collision with numeric variable. EXPECT_TRUE(Tester.parsePatternExpect("[[FOO:]]")); - // Valid use of pattern variable. + // Valid use of string variable. EXPECT_FALSE(Tester.parsePatternExpect("[[BAR]]")); - // Valid pattern variable definition. 
EXPECT_FALSE(Tester.parsePatternExpect("[[PAT:[0-9]+]]")); - // Invalid numeric expressions. + // Invalid numeric substitution. EXPECT_TRUE(Tester.parsePatternExpect("[[#42INVALID]]")); - EXPECT_TRUE(Tester.parsePatternExpect("[[#@FOO]]")); - EXPECT_TRUE(Tester.parsePatternExpect("[[#@LINE/2]]")); - // Valid numeric expressions and numeric variable definition. + // Valid numeric substitution. EXPECT_FALSE(Tester.parsePatternExpect("[[#FOO]]")); - EXPECT_FALSE(Tester.parsePatternExpect("[[#@LINE+2]]")); - EXPECT_FALSE(Tester.parsePatternExpect("[[#NUMVAR:]]")); } TEST_F(FileCheckTest, Match) { @@ -374,7 +395,17 @@ EXPECT_TRUE(Tester.matchExpect("")); EXPECT_FALSE(Tester.matchExpect("18")); - // Check matching the variable defined matches the correct number only + // Check matching an undefined variable returns a NotFound error. + Tester.initNextPattern(); + ASSERT_FALSE(Tester.parsePatternExpect("100")); + EXPECT_TRUE(Tester.matchExpect("101")); + + // Check matching the defined variable matches the correct number only. + Tester.initNextPattern(); + ASSERT_FALSE(Tester.parsePatternExpect("[[#NUMVAR]]")); + EXPECT_FALSE(Tester.matchExpect("18")); + + // Check matching several substitutions does not match them independently. Tester.initNextPattern(); Tester.parsePatternExpect("[[#NUMVAR]] [[#NUMVAR+2]]"); EXPECT_TRUE(Tester.matchExpect("19 21")); @@ -385,16 +416,17 @@ // the correct value for @LINE. Tester.initNextPattern(); EXPECT_FALSE(Tester.parsePatternExpect("[[#@LINE]]")); - // Ok, @LINE is 5 now. - EXPECT_FALSE(Tester.matchExpect("5")); + // Ok, @LINE matches the current line number. + EXPECT_FALSE(Tester.matchExpect(std::to_string(Tester.getLineNumber()))); Tester.initNextPattern(); - // @LINE is now 6, match with substitution failure. + // Match with substitution failure. EXPECT_FALSE(Tester.parsePatternExpect("[[#UNKNOWN]]")); EXPECT_TRUE(Tester.matchExpect("FOO")); Tester.initNextPattern(); - // Check that @LINE is 7 as expected. 
+ // Check that @LINE matches the later (given the calls to initNextPattern()) + // line number. EXPECT_FALSE(Tester.parsePatternExpect("[[#@LINE]]")); - EXPECT_FALSE(Tester.matchExpect("7")); + EXPECT_FALSE(Tester.matchExpect(std::to_string(Tester.getLineNumber()))); } TEST_F(FileCheckTest, Substitution) { @@ -411,30 +443,19 @@ ASSERT_FALSE(bool(SubstValue)); expectUndefError("VAR404", SubstValue.takeError()); - // Substitutions of defined pseudo and non-pseudo numeric variables return - // the right value. - NumericVariable LineVar("@LINE", 1); + // Numeric substitution blocks constituted of defined numeric variables are + // substituted for the variable's value. NumericVariable NVar("N", 1); - LineVar.setValue(42); NVar.setValue(10); - auto LineVarUse = std::make_unique("@LINE", &LineVar); auto NVarUse = std::make_unique("N", &NVar); - NumericSubstitution SubstitutionLine(&Context, "@LINE", std::move(LineVarUse), - 12); - NumericSubstitution SubstitutionN(&Context, "N", std::move(NVarUse), 30); - SubstValue = SubstitutionLine.getResult(); - ASSERT_TRUE(bool(SubstValue)); - EXPECT_EQ("42", *SubstValue); + NumericSubstitution SubstitutionN(&Context, "N", std::move(NVarUse), + /*InsertIdx=*/30); SubstValue = SubstitutionN.getResult(); ASSERT_TRUE(bool(SubstValue)); EXPECT_EQ("10", *SubstValue); // Substitution of an undefined numeric variable fails, error holds name of // undefined variable. - LineVar.clearValue(); - SubstValue = SubstitutionLine.getResult(); - ASSERT_FALSE(bool(SubstValue)); - expectUndefError("@LINE", SubstValue.takeError()); NVar.clearValue(); SubstValue = SubstitutionN.getResult(); ASSERT_FALSE(bool(SubstValue)); @@ -453,6 +474,9 @@ std::vector GlobalDefines; SourceMgr SM; + // No definition. + EXPECT_FALSE(errorToBool(Cxt.defineCmdlineVariables(GlobalDefines, SM))); + // Missing equal sign. 
GlobalDefines.emplace_back(std::string("LocalVar")); EXPECT_TRUE(errorToBool(Cxt.defineCmdlineVariables(GlobalDefines, SM))); @@ -505,19 +529,35 @@ GlobalDefines.emplace_back(std::string("#LocalNumVar2=LocalNumVar1+2")); ASSERT_FALSE(errorToBool(Cxt.defineCmdlineVariables(GlobalDefines, SM))); - // Check defined variables are present and undefined is absent. + // Create @LINE pseudo numeric variable and check it is present by matching + // it. + size_t LineNumber = 1; + Pattern P(Check::CheckPlain, &Cxt, LineNumber); + FileCheckRequest Req; + Cxt.createLineVariable(); + ASSERT_FALSE(P.parsePattern("[[@LINE]]", "CHECK", SM, Req)); + size_t MatchLen; + ASSERT_FALSE(errorToBool(P.match("1", MatchLen, SM).takeError())); + +#ifndef NDEBUG + // Recreating @LINE pseudo numeric variable fails. + EXPECT_DEATH(Cxt.createLineVariable(), + "@LINE pseudo numeric variable already created"); +#endif + + // Check defined variables are present and undefined ones are absent. StringRef LocalVarStr = "LocalVar"; StringRef LocalNumVar1Ref = bufferize(SM, "LocalNumVar1"); StringRef LocalNumVar2Ref = bufferize(SM, "LocalNumVar2"); StringRef EmptyVarStr = "EmptyVar"; StringRef UnknownVarStr = "UnknownVar"; Expected LocalVar = Cxt.getPatternVarValue(LocalVarStr); - Pattern P(Check::CheckPlain, &Cxt, 1); + P = Pattern(Check::CheckPlain, &Cxt, ++LineNumber); Optional DefinedNumericVariable; Expected> ExpressionASTPointer = P.parseNumericSubstitutionBlock(LocalNumVar1Ref, DefinedNumericVariable, - /*IsLegacyLineExpr=*/false, - /*LineNumber=*/1, &Cxt, SM); + /*IsLegacyLineExpr=*/false, LineNumber, + &Cxt, SM); ASSERT_TRUE(bool(LocalVar)); EXPECT_EQ(*LocalVar, "FOO"); Expected EmptyVar = Cxt.getPatternVarValue(EmptyVarStr); @@ -526,10 +566,9 @@ Expected ExpressionVal = (*ExpressionASTPointer)->eval(); ASSERT_TRUE(bool(ExpressionVal)); EXPECT_EQ(*ExpressionVal, 18U); - ExpressionASTPointer = - P.parseNumericSubstitutionBlock(LocalNumVar2Ref, DefinedNumericVariable, - /*IsLegacyLineExpr=*/false, 
- /*LineNumber=*/1, &Cxt, SM); + ExpressionASTPointer = P.parseNumericSubstitutionBlock( + LocalNumVar2Ref, DefinedNumericVariable, + /*IsLegacyLineExpr=*/false, LineNumber, &Cxt, SM); ASSERT_TRUE(bool(ExpressionASTPointer)); ExpressionVal = (*ExpressionASTPointer)->eval(); ASSERT_TRUE(bool(ExpressionVal)); @@ -547,16 +586,16 @@ // variable clearing due to --enable-var-scope happens after numeric // expressions are linked to the numeric variables they use. EXPECT_TRUE(errorToBool((*ExpressionASTPointer)->eval().takeError())); - P = Pattern(Check::CheckPlain, &Cxt, 2); + P = Pattern(Check::CheckPlain, &Cxt, ++LineNumber); ExpressionASTPointer = P.parseNumericSubstitutionBlock( LocalNumVar1Ref, DefinedNumericVariable, /*IsLegacyLineExpr=*/false, - /*LineNumber=*/2, &Cxt, SM); + LineNumber, &Cxt, SM); ASSERT_TRUE(bool(ExpressionASTPointer)); ExpressionVal = (*ExpressionASTPointer)->eval(); EXPECT_TRUE(errorToBool(ExpressionVal.takeError())); ExpressionASTPointer = P.parseNumericSubstitutionBlock( LocalNumVar2Ref, DefinedNumericVariable, /*IsLegacyLineExpr=*/false, - /*LineNumber=*/2, &Cxt, SM); + LineNumber, &Cxt, SM); ASSERT_TRUE(bool(ExpressionASTPointer)); ExpressionVal = (*ExpressionASTPointer)->eval(); EXPECT_TRUE(errorToBool(ExpressionVal.takeError())); @@ -575,10 +614,10 @@ Expected GlobalVar = Cxt.getPatternVarValue(GlobalVarStr); ASSERT_TRUE(bool(GlobalVar)); EXPECT_EQ(*GlobalVar, "BAR"); - P = Pattern(Check::CheckPlain, &Cxt, 3); + P = Pattern(Check::CheckPlain, &Cxt, ++LineNumber); ExpressionASTPointer = P.parseNumericSubstitutionBlock( GlobalNumVarRef, DefinedNumericVariable, /*IsLegacyLineExpr=*/false, - /*LineNumber=*/3, &Cxt, SM); + LineNumber, &Cxt, SM); ASSERT_TRUE(bool(ExpressionASTPointer)); ExpressionVal = (*ExpressionASTPointer)->eval(); ASSERT_TRUE(bool(ExpressionVal)); @@ -587,10 +626,10 @@ // Clear local variables and check global variables remain defined. 
Cxt.clearLocalVars(); EXPECT_FALSE(errorToBool(Cxt.getPatternVarValue(GlobalVarStr).takeError())); - P = Pattern(Check::CheckPlain, &Cxt, 4); + P = Pattern(Check::CheckPlain, &Cxt, ++LineNumber); ExpressionASTPointer = P.parseNumericSubstitutionBlock( GlobalNumVarRef, DefinedNumericVariable, /*IsLegacyLineExpr=*/false, - /*LineNumber=*/4, &Cxt, SM); + LineNumber, &Cxt, SM); ASSERT_TRUE(bool(ExpressionASTPointer)); ExpressionVal = (*ExpressionASTPointer)->eval(); ASSERT_TRUE(bool(ExpressionVal)); diff --git a/llvm/unittests/tools/llvm-exegesis/Mips/SnippetGeneratorTest.cpp b/llvm/unittests/tools/llvm-exegesis/Mips/SnippetGeneratorTest.cpp --- a/llvm/unittests/tools/llvm-exegesis/Mips/SnippetGeneratorTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/Mips/SnippetGeneratorTest.cpp @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// #include "../Common/AssemblerUtils.h" -#include "Latency.h" #include "LlvmState.h" #include "MCInstrDescView.h" #include "MipsInstrInfo.h" +#include "ParallelSnippetGenerator.h" #include "RegisterAliasing.h" +#include "SerialSnippetGenerator.h" #include "TestBase.h" -#include "Uops.h" #include @@ -48,12 +48,12 @@ SnippetGeneratorT Generator; }; -using LatencySnippetGeneratorTest = - SnippetGeneratorTest; +using SerialSnippetGeneratorTest = SnippetGeneratorTest; -using UopsSnippetGeneratorTest = SnippetGeneratorTest; +using ParallelSnippetGeneratorTest = + SnippetGeneratorTest; -TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependencyThroughExplicitRegs) { +TEST_F(SerialSnippetGeneratorTest, ImplicitSelfDependencyThroughExplicitRegs) { // - ADD // - Op0 Explicit Def RegClass(GPR32) // - Op1 Explicit Use RegClass(GPR32) @@ -77,8 +77,8 @@ << "Op0 is either set to Op1 or to Op2"; } -TEST_F(LatencySnippetGeneratorTest, - ImplicitSelfDependencyThroughExplicitRegsForbidAll) { +TEST_F(SerialSnippetGeneratorTest, + ImplicitSelfDependencyThroughExplicitRegsForbidAll) { // - XOR // - Op0 Explicit Def 
RegClass(GPR32) // - Op1 Explicit Use RegClass(GPR32) @@ -96,7 +96,7 @@ consumeError(std::move(Error)); } -TEST_F(UopsSnippetGeneratorTest, MemoryUse) { +TEST_F(ParallelSnippetGeneratorTest, MemoryUse) { // LB reads from memory. // - LB // - Op0 Explicit Def RegClass(GPR32) @@ -110,10 +110,11 @@ const auto CodeTemplates = checkAndGetCodeTemplates(Opcode); ASSERT_THAT(CodeTemplates, SizeIs(1)); const auto &CT = CodeTemplates[0]; - EXPECT_THAT(CT.Info, HasSubstr("instruction is parallel, repeating a random one.")); + EXPECT_THAT(CT.Info, + HasSubstr("instruction is parallel, repeating a random one.")); EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN); ASSERT_THAT(CT.Instructions, - SizeIs(UopsSnippetGenerator::kMinNumDifferentAddresses)); + SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses)); const InstructionTemplate &IT = CT.Instructions[0]; EXPECT_THAT(IT.getOpcode(), Opcode); ASSERT_THAT(IT.getVariableValues(), SizeIs(3)); diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp --- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp @@ -7,12 +7,12 @@ //===----------------------------------------------------------------------===// #include "../Common/AssemblerUtils.h" -#include "Latency.h" #include "LlvmState.h" #include "MCInstrDescView.h" +#include "ParallelSnippetGenerator.h" #include "RegisterAliasing.h" +#include "SerialSnippetGenerator.h" #include "TestBase.h" -#include "Uops.h" #include "X86InstrInfo.h" #include @@ -59,12 +59,12 @@ SnippetGeneratorT Generator; }; -using LatencySnippetGeneratorTest = - SnippetGeneratorTest; +using SerialSnippetGeneratorTest = SnippetGeneratorTest; -using UopsSnippetGeneratorTest = SnippetGeneratorTest; +using ParallelSnippetGeneratorTest = + SnippetGeneratorTest; -TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependencyThroughImplicitReg) { 
+TEST_F(SerialSnippetGeneratorTest, ImplicitSelfDependencyThroughImplicitReg) { // - ADC16i16 // - Op0 Explicit Use Immediate // - Op1 Implicit Def Reg(AX) @@ -90,7 +90,7 @@ EXPECT_THAT(IT.getVariableValues()[0], IsInvalid()) << "Immediate is not set"; } -TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependencyThroughTiedRegs) { +TEST_F(SerialSnippetGeneratorTest, ImplicitSelfDependencyThroughTiedRegs) { // - ADD16ri // - Op0 Explicit Def RegClass(GR16) // - Op1 Explicit Use RegClass(GR16) TiedToOp0 @@ -114,7 +114,7 @@ EXPECT_THAT(IT.getVariableValues()[1], IsInvalid()) << "Operand 2 is not set"; } -TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependencyThroughExplicitRegs) { +TEST_F(SerialSnippetGeneratorTest, ImplicitSelfDependencyThroughExplicitRegs) { // - VXORPSrr // - Op0 Explicit Def RegClass(VR128) // - Op1 Explicit Use RegClass(VR128) @@ -138,7 +138,7 @@ << "Op0 is either set to Op1 or to Op2"; } -TEST_F(LatencySnippetGeneratorTest, +TEST_F(SerialSnippetGeneratorTest, ImplicitSelfDependencyThroughExplicitRegsForbidAll) { // - VXORPSrr // - Op0 Explicit Def RegClass(VR128) @@ -158,7 +158,7 @@ consumeError(std::move(Error)); } -TEST_F(LatencySnippetGeneratorTest, DependencyThroughOtherOpcode) { +TEST_F(SerialSnippetGeneratorTest, DependencyThroughOtherOpcode) { // - CMP64rr // - Op0 Explicit Use RegClass(GR64) // - Op1 Explicit Use RegClass(GR64) @@ -182,7 +182,7 @@ } } -TEST_F(LatencySnippetGeneratorTest, LAHF) { +TEST_F(SerialSnippetGeneratorTest, LAHF) { // - LAHF // - Op0 Implicit Def Reg(AH) // - Op1 Implicit Use Reg(EFLAGS) @@ -198,7 +198,7 @@ } } -TEST_F(UopsSnippetGeneratorTest, ParallelInstruction) { +TEST_F(ParallelSnippetGeneratorTest, ParallelInstruction) { // - BNDCL32rr // - Op0 Explicit Use RegClass(BNDR) // - Op1 Explicit Use RegClass(GR32) @@ -218,7 +218,7 @@ EXPECT_THAT(IT.getVariableValues()[1], IsInvalid()); } -TEST_F(UopsSnippetGeneratorTest, SerialInstruction) { +TEST_F(ParallelSnippetGeneratorTest, SerialInstruction) { // - CDQ // 
- Op0 Implicit Def Reg(EAX) // - Op1 Implicit Def Reg(EDX) @@ -237,7 +237,7 @@ ASSERT_THAT(IT.getVariableValues(), SizeIs(0)); } -TEST_F(UopsSnippetGeneratorTest, StaticRenaming) { +TEST_F(ParallelSnippetGeneratorTest, StaticRenaming) { // CMOV32rr has tied variables, we enumerate the possible values to execute // as many in parallel as possible. @@ -268,7 +268,7 @@ << "Each instruction writes to a different register"; } -TEST_F(UopsSnippetGeneratorTest, NoTiedVariables) { +TEST_F(ParallelSnippetGeneratorTest, NoTiedVariables) { // CMOV_GR32 has no tied variables, we make sure def and use are different // from each other. @@ -302,7 +302,7 @@ EXPECT_THAT(IT.getVariableValues()[3], IsInvalid()); } -TEST_F(UopsSnippetGeneratorTest, MemoryUse) { +TEST_F(ParallelSnippetGeneratorTest, MemoryUse) { // Mov32rm reads from memory. // - MOV32rm // - Op0 Explicit Def RegClass(GR32) @@ -326,7 +326,7 @@ EXPECT_THAT(CT.Info, HasSubstr("no tied variables")); EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN); ASSERT_THAT(CT.Instructions, - SizeIs(UopsSnippetGenerator::kMinNumDifferentAddresses)); + SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses)); const InstructionTemplate &IT = CT.Instructions[0]; EXPECT_THAT(IT.getOpcode(), Opcode); ASSERT_THAT(IT.getVariableValues(), SizeIs(6)); diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp --- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp @@ -7,12 +7,10 @@ //===----------------------------------------------------------------------===// #include "../Common/AssemblerUtils.h" -#include "Latency.h" #include "LlvmState.h" #include "MCInstrDescView.h" #include "RegisterAliasing.h" #include "TestBase.h" -#include "Uops.h" #include "X86InstrInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -34,8 +32,7 @@ void SetUp() { TM = State.createTargetMachine(); 
Context = std::make_unique(); - Mod = - std::make_unique("X86SnippetRepetitorTest", *Context); + Mod = std::make_unique("X86SnippetRepetitorTest", *Context); Mod->setDataLayout(TM->createDataLayout()); MMI = std::make_unique(TM.get()); MF = &createVoidVoidPtrMachineFunction("TestFn", Mod.get(), MMI.get()); diff --git a/llvm/unittests/tools/llvm-exegesis/X86/TestBase.h b/llvm/unittests/tools/llvm-exegesis/X86/TestBase.h --- a/llvm/unittests/tools/llvm-exegesis/X86/TestBase.h +++ b/llvm/unittests/tools/llvm-exegesis/X86/TestBase.h @@ -1,4 +1,4 @@ -//===-- TestBase.cpp --------------------------------------------*- C++ -*-===// +//===-- TestBase.h ----------------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/utils/gdb-scripts/prettyprinters.py b/llvm/utils/gdb-scripts/prettyprinters.py --- a/llvm/utils/gdb-scripts/prettyprinters.py +++ b/llvm/utils/gdb-scripts/prettyprinters.py @@ -2,6 +2,7 @@ import sys import gdb.printing +import gdb.types class Iterator: def __iter__(self): @@ -315,6 +316,51 @@ def to_string(self): return self.string_from_twine_object(self._val) +def make_printer(string = None, children = None, hint = None): + """Create a printer from the parameters.""" + class Printer : pass + printer = Printer() + if string: + setattr(printer, 'to_string', lambda: string) + if children: + setattr(printer, 'children', lambda: children) + if hint: + setattr(printer, 'display_hint', lambda: hint) + return printer + +def get_pointer_int_pair(val): + """Get iterable with zero or single tuple from llvm::PointerIntPair.""" + info_name = val.type.template_argument(4).strip_typedefs().name + try: + enum_type = gdb.lookup_type(info_name + '::MaskAndShiftConstants') + except gdb.error: + return + enum_dict = gdb.types.make_enum_dict(enum_type) + ptr_mask = enum_dict[info_name + '::PointerBitMask'] + int_shift = 
enum_dict[info_name + '::IntShift'] + int_mask = enum_dict[info_name + '::IntMask'] + pair_union = val['Value'] + pointer = (pair_union & ptr_mask) + value = ((pair_union >> int_shift) & int_mask) + yield (pointer, value) + +def make_pointer_int_pair_printer(val): + """Factory for an llvm::PointerIntPair printer.""" + for (pointer, value) in get_pointer_int_pair(val): + pointer_type = val.type.template_argument(0) + value_type = val.type.template_argument(2) + string = 'llvm::PointerIntPair<%s>' % pointer_type + children = [('pointer', pointer.cast(pointer_type)), + ('value', value.cast(value_type))] + return make_printer(string, children) + +def make_pointer_union_printer(val): + """Factory for an llvm::PointerUnion printer.""" + for (pointer, value) in get_pointer_int_pair(val['Val']): + pointer_type = val.type.template_argument(int(value)) + string = 'llvm::PointerUnion containing %s' % pointer_type + return make_printer(string, [('pointer', pointer.cast(pointer_type))]) + pp = gdb.printing.RegexpCollectionPrettyPrinter("LLVMSupport") pp.add_printer('llvm::SmallString', '^llvm::SmallString<.*>$', SmallStringPrinter) pp.add_printer('llvm::StringRef', '^llvm::StringRef$', StringRefPrinter) @@ -324,4 +370,6 @@ pp.add_printer('llvm::Optional', '^llvm::Optional<.*>$', OptionalPrinter) pp.add_printer('llvm::DenseMap', '^llvm::DenseMap<.*>$', DenseMapPrinter) pp.add_printer('llvm::Twine', '^llvm::Twine$', TwinePrinter) +pp.add_printer('llvm::PointerIntPair', '^llvm::PointerIntPair<.*>$', make_pointer_int_pair_printer) +pp.add_printer('llvm::PointerUnion', '^llvm::PointerUnion<.*>$', make_pointer_union_printer) gdb.printing.register_pretty_printer(gdb.current_objfile(), pp) diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn @@ -225,6 +225,7 @@ "//llvm/tools/llvm-lto2", "//llvm/tools/llvm-mc", "//llvm/tools/llvm-mca", + 
"//llvm/tools/llvm-ml", "//llvm/tools/llvm-modextract", "//llvm/tools/llvm-mt", "//llvm/tools/llvm-nm", diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/BUILD.gn @@ -21,18 +21,20 @@ "BenchmarkRunner.cpp", "Clustering.cpp", "CodeTemplate.cpp", - "Latency.cpp", + "LatencyBenchmarkRunner.cpp", "LlvmState.cpp", "MCInstrDescView.cpp", + "ParallelSnippetGenerator.cpp", "PerfHelper.cpp", "RegisterAliasing.cpp", "RegisterValue.cpp", "SchedClassResolution.cpp", + "SerialSnippetGenerator.cpp", "SnippetFile.cpp", "SnippetGenerator.cpp", "SnippetRepetitor.cpp", "Target.cpp", - "Uops.cpp", + "UopsBenchmarkRunner.cpp", ] if (llvm_build_AArch64) { diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-ml/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-ml/BUILD.gn new file mode 100644 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-ml/BUILD.gn @@ -0,0 +1,15 @@ +executable("llvm-ml") { + deps = [ + "//llvm/lib/MC", + "//llvm/lib/MC/MCParser", + "//llvm/lib/Support", + "//llvm/lib/Target:AllTargetsAsmParsers", + "//llvm/lib/Target:AllTargetsDescs", + "//llvm/lib/Target:AllTargetsDisassemblers", + "//llvm/lib/Target:AllTargetsInfos", + ] + sources = [ + "Disassembler.cpp", + "llvm-ml.cpp", + ] +} diff --git a/mlir/docs/LangRef.md b/mlir/docs/LangRef.md --- a/mlir/docs/LangRef.md +++ b/mlir/docs/LangRef.md @@ -391,9 +391,10 @@ ``` A [block](https://en.wikipedia.org/wiki/Basic_block) is a sequential list of -operations without control flow (calls are not considered control flow for this -purpose) that are executed from top to bottom. The last operation in a block is -a [terminator operation](#terminator-operations), which ends the block. 
+operations without control flow (a call or entering an op's region is not +considered control flow for this purpose) that are executed from top to bottom. +The last operation in a block is a +[terminator operation](#terminator-operations), which ends the block. Blocks in MLIR take a list of block arguments, which represent SSA PHI nodes in a functional notation. The arguments are defined by the block, and values are @@ -501,8 +502,11 @@ Regions are Single-Entry-Multiple-Exit (SEME). This means that control can only flow into the first block of the region, but can flow out of the region at the end of any of the contained blocks (This behavior is similar to that of a -function body in most programming languages). When exiting a Region, control is -returned to the enclosing operation. +function body in most programming languages). A terminator of a block within a +region may transfer the control flow to another block in this region, or return +it to the immediately enclosing op. The semantics of the enclosing op defines +where the control flow is transmitted next. It may, for example, enter a region +of the same op, including the same region that returned the control flow. The enclosing operation determines the way in which control is transmitted into the entry block of a Region. 
The successor to a region’s exit points may not diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -55,6 +55,7 @@ DIRECTORY) set(cuda_sources + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip ${devicertl_base_directory}/common/src/cancel.cu ${devicertl_base_directory}/common/src/critical.cu diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip @@ -0,0 +1,61 @@ +//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "target_impl.h" + +// Partially derived from hcc_detail/device_functions.h + +// HW_ID Register bit structure +// WAVE_ID 3:0 Wave buffer slot number. 0-9. +// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. +// PIPE_ID 7:6 Pipeline from which the wave was dispatched. +// CU_ID 11:8 Compute Unit the wave is assigned to. +// SH_ID 12 Shader Array (within an SE) the wave is assigned to. +// SE_ID 14:13 Shader Engine the wave is assigned to. +// TG_ID 19:16 Thread-group ID +// VM_ID 23:20 Virtual Memory ID +// QUEUE_ID 26:24 Queue from which this wave was dispatched. +// STATE_ID 29:27 State ID (graphics only, not compute). +// ME_ID 31:30 Micro-engine ID. 
+ +enum { + HW_ID = 4, // specify that the hardware register to read is HW_ID + + HW_ID_CU_ID_SIZE = 4, // size of CU_ID field in bits + HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register + + HW_ID_SE_ID_SIZE = 2, // size of SE_ID field in bits + HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register +}; + +// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit +// immediate and returns a 32 bit value. +// The encoding of the immediate parameter is: +// ID 5:0 Which register to read from +// OFFSET 10:6 Range: 0..31 +// WIDTH 15:11 Range: 1..32 + +// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width) +// where hwreg forms a 16 bit immediate encoded by the assembler thus: +// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { +// return (Id << 0) | (Offset << 6) | ((Width - 1) << 11); +// } +#define ENCODE_HWREG(WIDTH, OFF, REG) (REG | (OFF << 6) | ((WIDTH - 1) << 11)) + +// Note: The results can be changed by a context switch +// Return value in [0, 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper +// bound on how many compute units are available. Some values in this +// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs. + +DEVICE uint32_t __kmpc_impl_smid() { + uint32_t cu_id = __builtin_amdgcn_s_getreg( + ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID)); + uint32_t se_id = __builtin_amdgcn_s_getreg( + ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID)); + return (se_id << HW_ID_CU_ID_SIZE) + cu_id; +}