diff --git a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.h b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.h --- a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.h @@ -42,6 +42,7 @@ const bool CheckFunctionCalls; const std::string RawAssertList; SmallVector AssertMacros; + const std::vector<StringRef> IgnoredFunctions; }; } // namespace bugprone diff --git a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp --- a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "AssertSideEffectCheck.h" +#include "../utils/Matchers.h" +#include "../utils/OptionsUtils.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/Frontend/CompilerInstance.h" @@ -25,7 +27,9 @@ namespace { -AST_MATCHER_P(Expr, hasSideEffect, bool, CheckFunctionCalls) { +AST_MATCHER_P2(Expr, hasSideEffect, bool, CheckFunctionCalls, + clang::ast_matchers::internal::Matcher<NamedDecl>, + IgnoredFunctionsMatcher) { const Expr *E = &Node; if (const auto *Op = dyn_cast<UnaryOperator>(E)) { @@ -55,7 +59,8 @@ bool Result = CheckFunctionCalls; if (const auto *FuncDecl = CExpr->getDirectCallee()) { if (FuncDecl->getDeclName().isIdentifier() && - FuncDecl->getName() == "__builtin_expect") // exceptions come here + IgnoredFunctionsMatcher.matches(*FuncDecl, Finder, + Builder)) // exceptions come here Result = false; else if (const auto *MethodDecl = dyn_cast<const CXXMethodDecl>(FuncDecl)) Result &= !MethodDecl->isConst(); @@ -72,8 +77,9 @@ ClangTidyContext *Context) : ClangTidyCheck(Name, Context), CheckFunctionCalls(Options.get("CheckFunctionCalls", false)), - RawAssertList(Options.get("AssertMacros", - "assert,NSAssert,NSCAssert")) { + 
RawAssertList(Options.get("AssertMacros", "assert,NSAssert,NSCAssert")), + IgnoredFunctions(utils::options::parseStringList( + "__builtin_expect;" + Options.get("IgnoredFunctions", ""))) { StringRef(RawAssertList).split(AssertMacros, ",", -1, false); } @@ -81,11 +87,17 @@ void AssertSideEffectCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "CheckFunctionCalls", CheckFunctionCalls); Options.store(Opts, "AssertMacros", RawAssertList); + Options.store(Opts, "IgnoredFunctions", + utils::options::serializeStringList(IgnoredFunctions)); } void AssertSideEffectCheck::registerMatchers(MatchFinder *Finder) { + auto IgnoredFunctionsMatcher = + matchers::matchesAnyListedName(IgnoredFunctions); + auto DescendantWithSideEffect = - traverse(TK_AsIs, hasDescendant(expr(hasSideEffect(CheckFunctionCalls)))); + traverse(TK_AsIs, hasDescendant(expr(hasSideEffect( + CheckFunctionCalls, IgnoredFunctionsMatcher)))); auto ConditionWithSideEffect = hasCondition(DescendantWithSideEffect); Finder->addMatcher( stmt( diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -133,7 +133,7 @@ - New :doc:`readability-duplicate-include ` check. - Looks for duplicate includes and removes them. + Looks for duplicate includes and removes them. - New :doc:`readability-identifier-length ` check. @@ -167,7 +167,13 @@ Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ +- :doc:`bugprone-assert-side-effect ` + check now supports an ``IgnoredFunctions`` option to explicitly consider + the specified semicolon-separated functions list as not having any + side-effects. Regular expressions for the list items are also accepted. + - Removed default setting ``cppcoreguidelines-explicit-virtual-functions.IgnoreDestructors = "true"``, + from :doc:`cppcoreguidelines-explicit-virtual-functions ` to match the current state of the C++ Core Guidelines. 
- Removed suggestion ``use gsl::at`` from warning message in the @@ -185,10 +191,10 @@ - Fixed a false positive in :doc:`bugprone-throw-keyword-missing ` when creating an exception object - using placement new + using placement new. - :doc:`cppcoreguidelines-narrowing-conversions ` - check now supports a `WarnOnIntegerToFloatingPointNarrowingConversion` + check now supports a ``WarnOnIntegerToFloatingPointNarrowingConversion`` option to control whether to warn on narrowing integer to floating-point conversions. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone-assert-side-effect.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone-assert-side-effect.rst --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone-assert-side-effect.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone-assert-side-effect.rst @@ -21,3 +21,13 @@ Whether to treat non-const member and non-member functions as they produce side effects. Disabled by default because it can increase the number of false positive warnings. + +.. option:: IgnoredFunctions + + A semicolon-separated list of the names of functions or methods to be + considered as not having side-effects. Regular expressions are accepted, + e.g. `[Rr]ef(erence)?$` matches every function with suffix `Ref`, `ref`, + `Reference` and `reference`. The default is empty. If a name in the list + contains the sequence `::` it is matched against the qualified name + (i.e. `namespace::functionName`), otherwise it is matched against only + the function name (i.e. `functionName`). 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone-assert-side-effect.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone-assert-side-effect.cpp --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone-assert-side-effect.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone-assert-side-effect.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s bugprone-assert-side-effect %t -- -config="{CheckOptions: [{key: bugprone-assert-side-effect.CheckFunctionCalls, value: true}, {key: bugprone-assert-side-effect.AssertMacros, value: 'assert,assert2,my_assert,convoluted_assert,msvc_assert'}]}" -- -fexceptions +// RUN: %check_clang_tidy %s bugprone-assert-side-effect %t -- -config="{CheckOptions: [{key: bugprone-assert-side-effect.CheckFunctionCalls, value: true}, {key: bugprone-assert-side-effect.AssertMacros, value: 'assert,assert2,my_assert,convoluted_assert,msvc_assert'}, {key: bugprone-assert-side-effect.IgnoredFunctions, value: 'MyClass::badButIgnoredFunc'}]}" -- -fexceptions //===--- assert definition block ------------------------------------------===// int abort() { return 0; } @@ -43,9 +43,12 @@ //===----------------------------------------------------------------------===// +bool badButIgnoredFunc(int a, int b) { return a * b > 0; } + class MyClass { public: bool badFunc(int a, int b) { return a * b > 0; } + bool badButIgnoredFunc(int a, int b) { return a * b > 0; } bool goodFunc(int a, int b) const { return a * b > 0; } MyClass &operator=(const MyClass &rhs) { return *this; } @@ -57,6 +60,11 @@ void operator delete(void *p) {} }; +class SomeoneElseClass { +public: + bool badButIgnoredFunc(int a, int b) { return a * b > 0; } +}; + bool freeFunction() { return true; } @@ -85,8 +93,16 @@ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: side effect in assert() condition discarded in release builds MyClass mc; + SomeoneElseClass sec; assert(mc.badFunc(0, 1)); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: side effect in assert() condition discarded in 
release builds + assert(mc.badButIgnoredFunc(0, 1)); + // badButIgnoredFunc is not ignored as only class members are ignored by the config + assert(badButIgnoredFunc(0, 1)); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: side effect in assert() condition discarded in release builds + // sec.badButIgnoredFunc is not ignored as only MyClass members are ignored by the config + assert(sec.badButIgnoredFunc(0, 1)); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: side effect in assert() condition discarded in release builds assert(mc.goodFunc(0, 1)); MyClass mc2; diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -134,6 +134,8 @@ set(RUNTIMES_${target}_LLVM_ENABLE_ASSERTIONS OFF CACHE BOOL "") set(RUNTIMES_${target}_SANITIZER_CXX_ABI "libc++" CACHE STRING "") set(RUNTIMES_${target}_SANITIZER_CXX_ABI_INTREE ON CACHE BOOL "") + set(RUNTIMES_${target}_SANITIZER_TEST_CXX "libc++" CACHE STRING "") + set(RUNTIMES_${target}_SANITIZER_TEST_CXX_INTREE ON CACHE BOOL "") set(RUNTIMES_${target}_COMPILER_RT_TEST_COMPILER_CFLAGS "--unwindlib=libunwind -static-libgcc" CACHE STRING "") set(RUNTIMES_${target}_SANITIZER_COMMON_TEST_TARGET_CFLAGS "--unwindlib=libunwind -static-libgcc" CACHE STRING "") set(RUNTIMES_${target}_TSAN_TEST_TARGET_CFLAGS "--unwindlib=libunwind -static-libgcc" CACHE STRING "") diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -256,9 +256,9 @@ ---------------------- - The default DWARF version has increased from DWARFv4 to DWARFv5. You can opt - back in to the old behavior with -gdwarf-4. Some platforms (Darwin, Android, - and SCE for instance) already opt out of this version bump as is suitable for - the platform + back in to the old behavior with ``-gdwarf-4`` or ``-fdebug-default-version=4``. 
+ Some platforms (Darwin, Android, and SCE for instance) already opt out of this + version bump as is suitable for the platform Arm and AArch64 Support in Clang -------------------------------- diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -1668,11 +1668,13 @@ // 12.10. Vector Single-Width Integer Multiply Instructions defm vmul : RVVIntBinBuiltinSet; +let RequiredFeatures = ["FullMultiply"] in { defm vmulh : RVVSignedBinBuiltinSet; defm vmulhu : RVVUnsignedBinBuiltinSet; defm vmulhsu : RVVOutOp1BuiltinSet<"vmulhsu", "csil", [["vv", "v", "vvUv"], ["vx", "v", "vvUe"]]>; +} // 12.11. Vector Integer Divide Instructions defm vdivu : RVVUnsignedBinBuiltinSet; @@ -1759,7 +1761,9 @@ defm vasub : RVVSignedBinBuiltinSet; // 13.3. Vector Single-Width Fractional Multiply with Rounding and Saturation +let RequiredFeatures = ["FullMultiply"] in { defm vsmul : RVVSignedBinBuiltinSet; +} // 13.4. Vector Single-Width Scaling Shift Instructions defm vssrl : RVVUnsignedShiftBuiltinSet; diff --git a/clang/include/clang/Tooling/Transformer/SourceCodeBuilders.h b/clang/include/clang/Tooling/Transformer/SourceCodeBuilders.h --- a/clang/include/clang/Tooling/Transformer/SourceCodeBuilders.h +++ b/clang/include/clang/Tooling/Transformer/SourceCodeBuilders.h @@ -43,6 +43,15 @@ /// Determines whether printing this expression to the right of a unary operator /// requires a parentheses to preserve its meaning. bool needParensAfterUnaryOperator(const Expr &E); + +// Recognizes known types (and sugared versions thereof) that overload the `*` +// and `->` operator. Below is the list of currently included types, but it is +// subject to change: +// +// * std::unique_ptr, std::shared_ptr, std::weak_ptr, +// * std::optional, absl::optional, llvm::Optional, +// * absl::StatusOr, llvm::Expected. 
+bool isKnownPointerLikeType(QualType Ty, ASTContext &Context); /// @} /// \name Basic code-string generation utilities. @@ -69,6 +78,8 @@ /// `x` becomes `x.` /// `*a` becomes `a->` /// `a+b` becomes `(a+b).` +/// +/// DEPRECATED. Use `buildAccess`. llvm::Optional buildDot(const Expr &E, const ASTContext &Context); /// Adds an arrow to the end of the given expression, but adds parentheses @@ -77,8 +88,32 @@ /// `x` becomes `x->` /// `&a` becomes `a.` /// `a+b` becomes `(a+b)->` +/// +/// DEPRECATED. Use `buildAccess`. llvm::Optional buildArrow(const Expr &E, const ASTContext &Context); + +/// Specifies how to classify pointer-like types -- like values or like pointers +/// -- with regard to generating member-access syntax. +enum class PLTClass : bool { + Value, + Pointer, +}; + +/// Adds an appropriate access operator (`.`, `->` or nothing, in the case of +/// implicit `this`) to the end of the given expression. Adds parentheses when +/// needed by the syntax and simplifies when possible. If `PLTypeClass` is +/// `Pointer`, for known pointer-like types (see `isKnownPointerLikeType`), +/// treats `operator->` and `operator*` like the built-in `->` and `*` +/// operators. 
+/// +/// `x` becomes `x->` or `x.`, depending on `E`'s type +/// `a+b` becomes `(a+b)->` or `(a+b).`, depending on `E`'s type +/// `&a` becomes `a.` +/// `*a` becomes `a->` +llvm::Optional +buildAccess(const Expr &E, ASTContext &Context, + PLTClass Classification = PLTClass::Pointer); /// @} } // namespace tooling diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17190,7 +17190,7 @@ case NVPTX::BI__mma_tf32_m16n16k8_ld_a: return MMA_LDST(4, m16n16k8_load_a_tf32); case NVPTX::BI__mma_tf32_m16n16k8_ld_b: - return MMA_LDST(2, m16n16k8_load_b_tf32); + return MMA_LDST(4, m16n16k8_load_b_tf32); case NVPTX::BI__mma_tf32_m16n16k8_ld_c: return MMA_LDST(8, m16n16k8_load_c_f32); diff --git a/clang/lib/Tooling/Transformer/SourceCodeBuilders.cpp b/clang/lib/Tooling/Transformer/SourceCodeBuilders.cpp --- a/clang/lib/Tooling/Transformer/SourceCodeBuilders.cpp +++ b/clang/lib/Tooling/Transformer/SourceCodeBuilders.cpp @@ -10,6 +10,8 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Tooling/Transformer/SourceCode.h" #include "llvm/ADT/Twine.h" #include @@ -60,6 +62,16 @@ return false; } +bool tooling::isKnownPointerLikeType(QualType Ty, ASTContext &Context) { + using namespace ast_matchers; + const auto PointerLikeTy = type(hasUnqualifiedDesugaredType( + recordType(hasDeclaration(cxxRecordDecl(hasAnyName( + "::std::unique_ptr", "::std::shared_ptr", "::std::weak_ptr", + "::std::optional", "::absl::optional", "::llvm::Optional", + "absl::StatusOr", "::llvm::Expected")))))); + return match(PointerLikeTy, Ty, Context).size() > 0; +} + llvm::Optional tooling::buildParens(const Expr &E, const ASTContext &Context) { StringRef Text = getText(E, Context); @@ -114,8 +126,10 @@ return ("&" + Text).str(); } -llvm::Optional 
tooling::buildDot(const Expr &E, - const ASTContext &Context) { +// Append the appropriate access operation (syntactically) to `E`, assuming `E` +// is a non-pointer value. +static llvm::Optional +buildAccessForValue(const Expr &E, const ASTContext &Context) { if (const auto *Op = llvm::dyn_cast(&E)) if (Op->getOpcode() == UO_Deref) { // Strip leading '*', add following '->'. @@ -138,8 +152,10 @@ return (Text + ".").str(); } -llvm::Optional tooling::buildArrow(const Expr &E, - const ASTContext &Context) { +// Append the appropriate access operation (syntactically) to `E`, assuming `E` +// is a pointer value. +static llvm::Optional +buildAccessForPointer(const Expr &E, const ASTContext &Context) { if (const auto *Op = llvm::dyn_cast(&E)) if (Op->getOpcode() == UO_AddrOf) { // Strip leading '&', add following '.'. @@ -160,3 +176,62 @@ return ("(" + Text + ")->").str(); return (Text + "->").str(); } + +llvm::Optional tooling::buildDot(const Expr &E, + const ASTContext &Context) { + return buildAccessForValue(E, Context); +} + +llvm::Optional tooling::buildArrow(const Expr &E, + const ASTContext &Context) { + return buildAccessForPointer(E, Context); +} + +// If `E` is an overloaded-operator call of kind `K` on an object `O`, returns +// `O`. Otherwise, returns `nullptr`. +static const Expr *maybeGetOperatorObjectArg(const Expr &E, + OverloadedOperatorKind K) { + if (const auto *OpCall = dyn_cast(&E)) { + if (OpCall->getOperator() == K && OpCall->getNumArgs() == 1) + return OpCall->getArg(0); + } + return nullptr; +} + +static bool treatLikePointer(QualType Ty, PLTClass C, ASTContext &Context) { + switch (C) { + case PLTClass::Value: + return false; + case PLTClass::Pointer: + return isKnownPointerLikeType(Ty, Context); + } +} + +// FIXME: move over the other `maybe` functionality from Stencil. Should all be +// in one place. 
+llvm::Optional tooling::buildAccess(const Expr &RawExpression, + ASTContext &Context, + PLTClass Classification) { + if (RawExpression.isImplicitCXXThis()) + // Return the empty string, because `None` signifies some sort of failure. + return std::string(); + + const Expr *E = RawExpression.IgnoreImplicitAsWritten(); + + if (E->getType()->isAnyPointerType() || + treatLikePointer(E->getType(), Classification, Context)) { + // Strip off operator-> calls. They can only occur inside an actual arrow + // member access, so we treat them as equivalent to an actual object + // expression. + if (const auto *Obj = maybeGetOperatorObjectArg(*E, clang::OO_Arrow)) + E = Obj; + return buildAccessForPointer(*E, Context); + } + + if (const auto *Obj = maybeGetOperatorObjectArg(*E, clang::OO_Star)) { + if (treatLikePointer(Obj->getType(), Classification, Context)) + return buildAccessForPointer(*Obj, Context); + }; + + return buildAccessForValue(*E, Context); +} diff --git a/clang/lib/Tooling/Transformer/Stencil.cpp b/clang/lib/Tooling/Transformer/Stencil.cpp --- a/clang/lib/Tooling/Transformer/Stencil.cpp +++ b/clang/lib/Tooling/Transformer/Stencil.cpp @@ -11,7 +11,6 @@ #include "clang/AST/ASTTypeTraits.h" #include "clang/AST/Expr.h" #include "clang/ASTMatchers/ASTMatchFinder.h" -#include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Basic/SourceLocation.h" #include "clang/Lex/Lexer.h" #include "clang/Tooling/Transformer/SourceCode.h" @@ -56,39 +55,6 @@ return Error::success(); } -// FIXME: Consider memoizing this function using the `ASTContext`. -static bool isSmartPointerType(QualType Ty, ASTContext &Context) { - using namespace ::clang::ast_matchers; - - // Optimization: hard-code common smart-pointer types. This can/should be - // removed if we start caching the results of this function. 
- auto KnownSmartPointer = - cxxRecordDecl(hasAnyName("::std::unique_ptr", "::std::shared_ptr")); - const auto QuacksLikeASmartPointer = cxxRecordDecl( - hasMethod(cxxMethodDecl(hasOverloadedOperatorName("->"), - returns(qualType(pointsTo(type()))))), - hasMethod(cxxMethodDecl(hasOverloadedOperatorName("*"), - returns(qualType(references(type())))))); - const auto SmartPointer = qualType(hasDeclaration( - cxxRecordDecl(anyOf(KnownSmartPointer, QuacksLikeASmartPointer)))); - return match(SmartPointer, Ty, Context).size() > 0; -} - -// Identifies use of `operator*` on smart pointers, and returns the underlying -// smart-pointer expression; otherwise, returns null. -static const Expr *isSmartDereference(const Expr &E, ASTContext &Context) { - using namespace ::clang::ast_matchers; - - const auto HasOverloadedArrow = cxxRecordDecl(hasMethod(cxxMethodDecl( - hasOverloadedOperatorName("->"), returns(qualType(pointsTo(type())))))); - // Verify it is a smart pointer by finding `operator->` in the class - // declaration. - auto Deref = cxxOperatorCallExpr( - hasOverloadedOperatorName("*"), hasUnaryOperand(expr().bind("arg")), - callee(cxxMethodDecl(ofClass(HasOverloadedArrow)))); - return selectFirst("arg", match(Deref, E, Context)); -} - namespace { // An arbitrary fragment of code within a stencil. class RawTextStencil : public StencilInterface { @@ -196,7 +162,7 @@ break; case UnaryNodeOperator::MaybeDeref: if (E->getType()->isAnyPointerType() || - isSmartPointerType(E->getType(), *Match.Context)) { + tooling::isKnownPointerLikeType(E->getType(), *Match.Context)) { // Strip off any operator->. This can only occur inside an actual arrow // member access, so we treat it as equivalent to an actual object // expression. @@ -216,7 +182,7 @@ break; case UnaryNodeOperator::MaybeAddressOf: if (E->getType()->isAnyPointerType() || - isSmartPointerType(E->getType(), *Match.Context)) { + tooling::isKnownPointerLikeType(E->getType(), *Match.Context)) { // Strip off any operator->. 
This can only occur inside an actual arrow // member access, so we treat it as equivalent to an actual object // expression. @@ -311,34 +277,12 @@ if (E == nullptr) return llvm::make_error(errc::invalid_argument, "Id not bound: " + BaseId); - if (!E->isImplicitCXXThis()) { - llvm::Optional S; - if (E->getType()->isAnyPointerType() || - isSmartPointerType(E->getType(), *Match.Context)) { - // Strip off any operator->. This can only occur inside an actual arrow - // member access, so we treat it as equivalent to an actual object - // expression. - if (const auto *OpCall = dyn_cast(E)) { - if (OpCall->getOperator() == clang::OO_Arrow && - OpCall->getNumArgs() == 1) { - E = OpCall->getArg(0); - } - } - S = tooling::buildArrow(*E, *Match.Context); - } else if (const auto *Operand = isSmartDereference(*E, *Match.Context)) { - // `buildDot` already handles the built-in dereference operator, so we - // only need to catch overloaded `operator*`. - S = tooling::buildArrow(*Operand, *Match.Context); - } else { - S = tooling::buildDot(*E, *Match.Context); - } - if (S.hasValue()) - *Result += *S; - else - return llvm::make_error( - errc::invalid_argument, - "Could not construct object text from ID: " + BaseId); - } + llvm::Optional S = tooling::buildAccess(*E, *Match.Context); + if (!S.hasValue()) + return llvm::make_error( + errc::invalid_argument, + "Could not construct object text from ID: " + BaseId); + *Result += *S; return Member->eval(Match, Result); } }; diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmul-eew64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmul-eew64.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmul-eew64.c @@ -0,0 +1,440 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck 
--check-prefix=CHECK-RV64 %s +// NOTE: This test file contains eew=64 of vmulh, vmulhu, vmulhsu. +// NOTE: The purpose of separating these 3 instructions from vmul.c is that +// eew=64 versions only enable when V extension is specified. (Not for zve) + +#include + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulh_vv_i64m1(vint64m1_t op1, vint64m1_t op2, size_t vl) { + return vmulh(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulh_vx_i64m1(vint64m1_t op1, int64_t op2, size_t vl) { + return vmulh(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulh_vv_i64m2(vint64m2_t op1, vint64m2_t op2, size_t vl) { + return vmulh(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulh_vx_i64m2(vint64m2_t op1, int64_t op2, size_t vl) { + return vmulh(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulh_vv_i64m4(vint64m4_t op1, vint64m4_t op2, size_t vl) { + return vmulh(op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: @test_vmulh_vx_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulh_vx_i64m4(vint64m4_t op1, int64_t op2, size_t vl) { + return vmulh(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulh_vv_i64m8(vint64m8_t op1, vint64m8_t op2, size_t vl) { + return vmulh(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulh_vx_i64m8(vint64m8_t op1, int64_t op2, size_t vl) { + return vmulh(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m1_t test_vmulhu_vv_u64m1(vuint64m1_t op1, vuint64m1_t op2, size_t vl) { + return vmulhu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m1_t test_vmulhu_vx_u64m1(vuint64m1_t op1, uint64_t op2, size_t vl) { + return vmulhu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m2_t 
test_vmulhu_vv_u64m2(vuint64m2_t op1, vuint64m2_t op2, size_t vl) { + return vmulhu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m2_t test_vmulhu_vx_u64m2(vuint64m2_t op1, uint64_t op2, size_t vl) { + return vmulhu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m4_t test_vmulhu_vv_u64m4(vuint64m4_t op1, vuint64m4_t op2, size_t vl) { + return vmulhu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m4_t test_vmulhu_vx_u64m4(vuint64m4_t op1, uint64_t op2, size_t vl) { + return vmulhu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m8_t test_vmulhu_vv_u64m8(vuint64m8_t op1, vuint64m8_t op2, size_t vl) { + return vmulhu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m8_t test_vmulhu_vx_u64m8(vuint64m8_t op1, uint64_t op2, size_t vl) { + return vmulhu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmulhsu.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulhsu_vv_i64m1(vint64m1_t op1, vuint64m1_t op2, size_t vl) { + return vmulhsu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulhsu_vx_i64m1(vint64m1_t op1, uint64_t op2, size_t vl) { + return vmulhsu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulhsu_vv_i64m2(vint64m2_t op1, vuint64m2_t op2, size_t vl) { + return vmulhsu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulhsu_vx_i64m2(vint64m2_t op1, uint64_t op2, size_t vl) { + return vmulhsu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulhsu_vv_i64m4(vint64m4_t op1, vuint64m4_t op2, size_t vl) { + return vmulhsu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulhsu_vx_i64m4(vint64m4_t op1, uint64_t op2, size_t vl) { + return vmulhsu(op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulhsu_vv_i64m8(vint64m8_t op1, vuint64m8_t op2, size_t vl) { + return vmulhsu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulhsu_vx_i64m8(vint64m8_t op1, uint64_t op2, size_t vl) { + return vmulhsu(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulh_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, vint64m1_t op2, size_t vl) { + return vmulh(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulh_vx_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, int64_t op2, size_t vl) { + return vmulh(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulh_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, vint64m2_t op2, size_t vl) { + return 
vmulh(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulh_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, int64_t op2, size_t vl) { + return vmulh(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulh_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, vint64m4_t op2, size_t vl) { + return vmulh(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulh_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, int64_t op2, size_t vl) { + return vmulh(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulh_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, vint64m8_t op2, size_t vl) { + return vmulh(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv8i64.i64.i64( 
[[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulh_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, int64_t op2, size_t vl) { + return vmulh(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m1_t test_vmulhu_vv_u64m1_m(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t op1, vuint64m1_t op2, size_t vl) { + return vmulhu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m1_t test_vmulhu_vx_u64m1_m(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t op1, uint64_t op2, size_t vl) { + return vmulhu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m2_t test_vmulhu_vv_u64m2_m(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { + return vmulhu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m2_t test_vmulhu_vx_u64m2_m(vbool32_t mask, vuint64m2_t 
maskedoff, vuint64m2_t op1, uint64_t op2, size_t vl) { + return vmulhu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m4_t test_vmulhu_vv_u64m4_m(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { + return vmulhu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m4_t test_vmulhu_vx_u64m4_m(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t op1, uint64_t op2, size_t vl) { + return vmulhu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m8_t test_vmulhu_vv_u64m8_m(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t op1, vuint64m8_t op2, size_t vl) { + return vmulhu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m8_t test_vmulhu_vx_u64m8_m(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t op1, uint64_t op2, size_t vl) { + return vmulhu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m1_m( +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulhsu_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, vuint64m1_t op2, size_t vl) { + return vmulhsu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulhsu_vx_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, uint64_t op2, size_t vl) { + return vmulhsu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulhsu_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, vuint64m2_t op2, size_t vl) { + return vmulhsu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulhsu_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, uint64_t op2, size_t vl) { + return vmulhsu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulhsu_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, vuint64m4_t op2, size_t vl) { + return vmulhsu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulhsu_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, uint64_t op2, size_t vl) { + return vmulhsu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulhsu_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, vuint64m8_t op2, size_t vl) { + return vmulhsu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulhsu_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, uint64_t op2, size_t vl) { + return vmulhsu(mask, maskedoff, op1, op2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmul.c --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmul.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmul.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: riscv-registered-target -// RUN: %clang_cc1 -triple riscv64 
-target-feature +v -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck --check-prefix=CHECK-RV64 %s +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck --check-prefix=CHECK-RV64 %s #include @@ -1120,78 +1120,6 @@ return vmulh(op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulh_vv_i64m1(vint64m1_t op1, vint64m1_t op2, size_t vl) { - return vmulh(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulh_vx_i64m1(vint64m1_t op1, int64_t op2, size_t vl) { - return vmulh(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulh_vv_i64m2(vint64m2_t op1, vint64m2_t op2, size_t vl) { - return vmulh(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulh_vx_i64m2(vint64m2_t op1, int64_t op2, size_t vl) { - return vmulh(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulh_vv_i64m4(vint64m4_t op1, vint64m4_t 
op2, size_t vl) { - return vmulh(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulh_vx_i64m4(vint64m4_t op1, int64_t op2, size_t vl) { - return vmulh(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulh_vv_i64m8(vint64m8_t op1, vint64m8_t op2, size_t vl) { - return vmulh(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulh_vx_i64m8(vint64m8_t op1, int64_t op2, size_t vl) { - return vmulh(op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vmulhu_vv_u8mf8( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) @@ -1516,78 +1444,6 @@ return vmulhu(op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m1_t test_vmulhu_vv_u64m1(vuint64m1_t op1, vuint64m1_t op2, size_t vl) { - return vmulhu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m1_t test_vmulhu_vx_u64m1(vuint64m1_t op1, uint64_t op2, size_t vl) { - return 
vmulhu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m2_t test_vmulhu_vv_u64m2(vuint64m2_t op1, vuint64m2_t op2, size_t vl) { - return vmulhu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m2_t test_vmulhu_vx_u64m2(vuint64m2_t op1, uint64_t op2, size_t vl) { - return vmulhu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m4_t test_vmulhu_vv_u64m4(vuint64m4_t op1, vuint64m4_t op2, size_t vl) { - return vmulhu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m4_t test_vmulhu_vx_u64m4(vuint64m4_t op1, uint64_t op2, size_t vl) { - return vmulhu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m8_t test_vmulhu_vv_u64m8(vuint64m8_t op1, vuint64m8_t op2, size_t vl) { - return vmulhu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret 
[[TMP0]] -// -vuint64m8_t test_vmulhu_vx_u64m8(vuint64m8_t op1, uint64_t op2, size_t vl) { - return vmulhu(op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vmulhsu_vv_i8mf8( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) @@ -1912,78 +1768,6 @@ return vmulhsu(op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulhsu_vv_i64m1(vint64m1_t op1, vuint64m1_t op2, size_t vl) { - return vmulhsu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulhsu_vx_i64m1(vint64m1_t op1, uint64_t op2, size_t vl) { - return vmulhsu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulhsu_vv_i64m2(vint64m2_t op1, vuint64m2_t op2, size_t vl) { - return vmulhsu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulhsu_vx_i64m2(vint64m2_t op1, uint64_t op2, size_t vl) { - return vmulhsu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: 
ret [[TMP0]] -// -vint64m4_t test_vmulhsu_vv_i64m4(vint64m4_t op1, vuint64m4_t op2, size_t vl) { - return vmulhsu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulhsu_vx_i64m4(vint64m4_t op1, uint64_t op2, size_t vl) { - return vmulhsu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulhsu_vv_i64m8(vint64m8_t op1, vuint64m8_t op2, size_t vl) { - return vmulhsu(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulhsu_vx_i64m8(vint64m8_t op1, uint64_t op2, size_t vl) { - return vmulhsu(op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vmul_vv_i8mf8_m( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmul.mask.nxv1i8.nxv1i8.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) @@ -3100,78 +2884,6 @@ return vmulh(mask, maskedoff, op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulh_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, vint64m1_t op2, size_t vl) { - return vmulh(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m1_m( -// 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulh_vx_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, int64_t op2, size_t vl) { - return vmulh(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulh_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, vint64m2_t op2, size_t vl) { - return vmulh(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulh_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, int64_t op2, size_t vl) { - return vmulh(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulh_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, vint64m4_t op2, size_t vl) { - return vmulh(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: 
ret [[TMP0]] -// -vint64m4_t test_vmulh_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, int64_t op2, size_t vl) { - return vmulh(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulh_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, vint64m8_t op2, size_t vl) { - return vmulh(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulh_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, int64_t op2, size_t vl) { - return vmulh(mask, maskedoff, op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vmulhu_vv_u8mf8_m( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv1i8.nxv1i8.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) @@ -3496,78 +3208,6 @@ return vmulhu(mask, maskedoff, op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m1_t test_vmulhu_vv_u64m1_m(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t op1, vuint64m1_t op2, size_t vl) { - return vmulhu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv1i64.i64.i64( 
[[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m1_t test_vmulhu_vx_u64m1_m(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t op1, uint64_t op2, size_t vl) { - return vmulhu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m2_t test_vmulhu_vv_u64m2_m(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { - return vmulhu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m2_t test_vmulhu_vx_u64m2_m(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t op1, uint64_t op2, size_t vl) { - return vmulhu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m4_t test_vmulhu_vv_u64m4_m(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { - return vmulhu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m4_t test_vmulhu_vx_u64m4_m(vbool16_t mask, 
vuint64m4_t maskedoff, vuint64m4_t op1, uint64_t op2, size_t vl) { - return vmulhu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m8_t test_vmulhu_vv_u64m8_m(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t op1, vuint64m8_t op2, size_t vl) { - return vmulhu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m8_t test_vmulhu_vx_u64m8_m(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t op1, uint64_t op2, size_t vl) { - return vmulhu(mask, maskedoff, op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vmulhsu_vv_i8mf8_m( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv1i8.nxv1i8.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) @@ -3891,75 +3531,3 @@ vint32m8_t test_vmulhsu_vx_i32m8_m(vbool4_t mask, vint32m8_t maskedoff, vint32m8_t op1, uint32_t op2, size_t vl) { return vmulhsu(mask, maskedoff, op1, op2, vl); } - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulhsu_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, vuint64m1_t op2, size_t vl) { - return vmulhsu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m1_m( -// CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulhsu_vx_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, uint64_t op2, size_t vl) { - return vmulhsu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulhsu_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, vuint64m2_t op2, size_t vl) { - return vmulhsu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulhsu_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, uint64_t op2, size_t vl) { - return vmulhsu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulhsu_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, vuint64m4_t op2, size_t vl) { - return vmulhsu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// 
CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulhsu_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, uint64_t op2, size_t vl) { - return vmulhsu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulhsu_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, vuint64m8_t op2, size_t vl) { - return vmulhsu(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulhsu_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, uint64_t op2, size_t vl) { - return vmulhsu(mask, maskedoff, op1, op2, vl); -} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vsmul-eew64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vsmul-eew64.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vsmul-eew64.c @@ -0,0 +1,159 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck --check-prefix=CHECK-RV64 %s +// NOTE: The purpose of separating these 3 instructions from vsmul.c is that +// eew=64 versions only enable when V extension is specified. 
(Not for zve) + +#include + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vsmul_vv_i64m1(vint64m1_t op1, vint64m1_t op2, size_t vl) { + return vsmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vsmul_vx_i64m1(vint64m1_t op1, int64_t op2, size_t vl) { + return vsmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vsmul_vv_i64m2(vint64m2_t op1, vint64m2_t op2, size_t vl) { + return vsmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vsmul_vx_i64m2(vint64m2_t op1, int64_t op2, size_t vl) { + return vsmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vsmul_vv_i64m4(vint64m4_t op1, vint64m4_t op2, size_t vl) { + return vsmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t 
test_vsmul_vx_i64m4(vint64m4_t op1, int64_t op2, size_t vl) { + return vsmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vsmul_vv_i64m8(vint64m8_t op1, vint64m8_t op2, size_t vl) { + return vsmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vsmul_vx_i64m8(vint64m8_t op1, int64_t op2, size_t vl) { + return vsmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vsmul_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, + vint64m1_t op1, vint64m1_t op2, size_t vl) { + return vsmul(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vsmul_vx_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, + vint64m1_t op1, int64_t op2, size_t vl) { + return vsmul(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t 
test_vsmul_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, + vint64m2_t op1, vint64m2_t op2, size_t vl) { + return vsmul(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vsmul_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, + vint64m2_t op1, int64_t op2, size_t vl) { + return vsmul(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vsmul_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, + vint64m4_t op1, vint64m4_t op2, size_t vl) { + return vsmul(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vsmul_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, + vint64m4_t op1, int64_t op2, size_t vl) { + return vsmul(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vsmul_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, + vint64m8_t op1, vint64m8_t op2, size_t vl) { + return vsmul(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m8_m( 
+// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vsmul_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, + vint64m8_t op1, int64_t op2, size_t vl) { + return vsmul(mask, maskedoff, op1, op2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vsmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vsmul.c --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vsmul.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vsmul.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: riscv-registered-target -// RUN: %clang_cc1 -triple riscv64 -target-feature +v -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck --check-prefix=CHECK-RV64 %s +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck --check-prefix=CHECK-RV64 %s #include @@ -328,78 +328,6 @@ return vsmul(op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vsmul_vv_i64m1(vint64m1_t op1, vint64m1_t op2, size_t vl) { - return vsmul(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vsmul_vx_i64m1(vint64m1_t op1, int64_t op2, size_t vl) { - return vsmul(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv2i64.nxv2i64.i64( 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vsmul_vv_i64m2(vint64m2_t op1, vint64m2_t op2, size_t vl) { - return vsmul(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vsmul_vx_i64m2(vint64m2_t op1, int64_t op2, size_t vl) { - return vsmul(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vsmul_vv_i64m4(vint64m4_t op1, vint64m4_t op2, size_t vl) { - return vsmul(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vsmul_vx_i64m4(vint64m4_t op1, int64_t op2, size_t vl) { - return vsmul(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vsmul_vv_i64m8(vint64m8_t op1, vint64m8_t op2, size_t vl) { - return vsmul(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vsmul_vx_i64m8(vint64m8_t op1, int64_t op2, size_t vl) { - return vsmul(op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vsmul_vv_i8mf8_m( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) @@ -762,83 +690,3 @@ vint32m8_t op1, int32_t op2, size_t vl) { return vsmul(mask, maskedoff, op1, op2, vl); } - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vsmul_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, - vint64m1_t op1, vint64m1_t op2, size_t vl) { - return vsmul(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vsmul_vx_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, - vint64m1_t op1, int64_t op2, size_t vl) { - return vsmul(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vsmul_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, - vint64m2_t op1, vint64m2_t op2, size_t vl) { - return vsmul(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vsmul_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, - vint64m2_t op1, int64_t op2, size_t vl) { - 
return vsmul(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vsmul_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, - vint64m4_t op1, vint64m4_t op2, size_t vl) { - return vsmul(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vsmul_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, - vint64m4_t op1, int64_t op2, size_t vl) { - return vsmul(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vsmul_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, - vint64m8_t op1, vint64m8_t op2, size_t vl) { - return vsmul(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vsmul_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, - vint64m8_t op1, int64_t op2, size_t vl) { - return vsmul(mask, maskedoff, op1, op2, vl); -} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vmul-eew64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmul-eew64.c new file mode 100644 --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmul-eew64.c @@ -0,0 +1,440 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck --check-prefix=CHECK-RV64 %s +// NOTE: This test file contains eew=64 of vmulh, vmulhu, vmulhsu. +// NOTE: The purpose of separating these 3 instructions from vmul.c is that +// eew=64 versions only enable when V extension is specified. (Not for zve) + +#include + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulh_vv_i64m1(vint64m1_t op1, vint64m1_t op2, size_t vl) { + return vmulh_vv_i64m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulh_vx_i64m1(vint64m1_t op1, int64_t op2, size_t vl) { + return vmulh_vx_i64m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulh_vv_i64m2(vint64m2_t op1, vint64m2_t op2, size_t vl) { + return vmulh_vv_i64m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulh_vx_i64m2(vint64m2_t op1, int64_t op2, size_t vl) { + return vmulh_vx_i64m2(op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: @test_vmulh_vv_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulh_vv_i64m4(vint64m4_t op1, vint64m4_t op2, size_t vl) { + return vmulh_vv_i64m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulh_vx_i64m4(vint64m4_t op1, int64_t op2, size_t vl) { + return vmulh_vx_i64m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulh_vv_i64m8(vint64m8_t op1, vint64m8_t op2, size_t vl) { + return vmulh_vv_i64m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulh_vx_i64m8(vint64m8_t op1, int64_t op2, size_t vl) { + return vmulh_vx_i64m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m1_t test_vmulhu_vv_u64m1(vuint64m1_t op1, vuint64m1_t op2, size_t vl) { + return vmulhu_vv_u64m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vuint64m1_t test_vmulhu_vx_u64m1(vuint64m1_t op1, uint64_t op2, size_t vl) { + return vmulhu_vx_u64m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m2_t test_vmulhu_vv_u64m2(vuint64m2_t op1, vuint64m2_t op2, size_t vl) { + return vmulhu_vv_u64m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m2_t test_vmulhu_vx_u64m2(vuint64m2_t op1, uint64_t op2, size_t vl) { + return vmulhu_vx_u64m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m4_t test_vmulhu_vv_u64m4(vuint64m4_t op1, vuint64m4_t op2, size_t vl) { + return vmulhu_vv_u64m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m4_t test_vmulhu_vx_u64m4(vuint64m4_t op1, uint64_t op2, size_t vl) { + return vmulhu_vx_u64m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m8_t test_vmulhu_vv_u64m8(vuint64m8_t op1, vuint64m8_t op2, size_t vl) { + return vmulhu_vv_u64m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m8( +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m8_t test_vmulhu_vx_u64m8(vuint64m8_t op1, uint64_t op2, size_t vl) { + return vmulhu_vx_u64m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulhsu_vv_i64m1(vint64m1_t op1, vuint64m1_t op2, size_t vl) { + return vmulhsu_vv_i64m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulhsu_vx_i64m1(vint64m1_t op1, uint64_t op2, size_t vl) { + return vmulhsu_vx_i64m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulhsu_vv_i64m2(vint64m2_t op1, vuint64m2_t op2, size_t vl) { + return vmulhsu_vv_i64m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulhsu_vx_i64m2(vint64m2_t op1, uint64_t op2, size_t vl) { + return vmulhsu_vx_i64m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t 
test_vmulhsu_vv_i64m4(vint64m4_t op1, vuint64m4_t op2, size_t vl) { + return vmulhsu_vv_i64m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulhsu_vx_i64m4(vint64m4_t op1, uint64_t op2, size_t vl) { + return vmulhsu_vx_i64m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulhsu_vv_i64m8(vint64m8_t op1, vuint64m8_t op2, size_t vl) { + return vmulhsu_vv_i64m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulhsu_vx_i64m8(vint64m8_t op1, uint64_t op2, size_t vl) { + return vmulhsu_vx_i64m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulh_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, vint64m1_t op2, size_t vl) { + return vmulh_vv_i64m1_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulh_vx_i64m1_m(vbool64_t mask, vint64m1_t 
maskedoff, vint64m1_t op1, int64_t op2, size_t vl) { + return vmulh_vx_i64m1_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulh_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, vint64m2_t op2, size_t vl) { + return vmulh_vv_i64m2_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulh_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, int64_t op2, size_t vl) { + return vmulh_vx_i64m2_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulh_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, vint64m4_t op2, size_t vl) { + return vmulh_vv_i64m4_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulh_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, int64_t op2, size_t vl) { + return vmulh_vx_i64m4_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vv_i64m8_m( +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulh_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, vint64m8_t op2, size_t vl) { + return vmulh_vv_i64m8_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulh_vx_i64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulh_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, int64_t op2, size_t vl) { + return vmulh_vx_i64m8_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m1_t test_vmulhu_vv_u64m1_m(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t op1, vuint64m1_t op2, size_t vl) { + return vmulhu_vv_u64m1_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m1_t test_vmulhu_vx_u64m1_m(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t op1, uint64_t op2, size_t vl) { + return vmulhu_vx_u64m1_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], 
[[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m2_t test_vmulhu_vv_u64m2_m(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { + return vmulhu_vv_u64m2_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m2_t test_vmulhu_vx_u64m2_m(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t op1, uint64_t op2, size_t vl) { + return vmulhu_vx_u64m2_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m4_t test_vmulhu_vv_u64m4_m(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { + return vmulhu_vv_u64m4_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m4_t test_vmulhu_vx_u64m4_m(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t op1, uint64_t op2, size_t vl) { + return vmulhu_vx_u64m4_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m8_t test_vmulhu_vv_u64m8_m(vbool8_t mask, vuint64m8_t 
maskedoff, vuint64m8_t op1, vuint64m8_t op2, size_t vl) { + return vmulhu_vv_u64m8_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m8_t test_vmulhu_vx_u64m8_m(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t op1, uint64_t op2, size_t vl) { + return vmulhu_vx_u64m8_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulhsu_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, vuint64m1_t op2, size_t vl) { + return vmulhsu_vv_i64m1_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vmulhsu_vx_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, uint64_t op2, size_t vl) { + return vmulhsu_vx_i64m1_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulhsu_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, vuint64m2_t op2, size_t vl) { + return vmulhsu_vv_i64m2_m(mask, maskedoff, op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vmulhsu_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, uint64_t op2, size_t vl) { + return vmulhsu_vx_i64m2_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulhsu_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, vuint64m4_t op2, size_t vl) { + return vmulhsu_vv_i64m4_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vmulhsu_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, uint64_t op2, size_t vl) { + return vmulhsu_vx_i64m4_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulhsu_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, vuint64m8_t op2, size_t vl) { + return vmulhsu_vv_i64m8_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmulhsu.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vmulhsu_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, uint64_t op2, size_t vl) { + return vmulhsu_vx_i64m8_m(mask, maskedoff, op1, op2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmul.c --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vmul.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmul.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: riscv-registered-target -// RUN: %clang_cc1 -triple riscv64 -target-feature +v -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck --check-prefix=CHECK-RV64 %s +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck --check-prefix=CHECK-RV64 %s #include @@ -1120,78 +1120,6 @@ return vmulh_vx_i32m8(op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulh_vv_i64m1(vint64m1_t op1, vint64m1_t op2, size_t vl) { - return vmulh_vv_i64m1(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulh_vx_i64m1(vint64m1_t op1, int64_t op2, size_t vl) { - return vmulh_vx_i64m1(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret 
[[TMP0]] -// -vint64m2_t test_vmulh_vv_i64m2(vint64m2_t op1, vint64m2_t op2, size_t vl) { - return vmulh_vv_i64m2(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulh_vx_i64m2(vint64m2_t op1, int64_t op2, size_t vl) { - return vmulh_vx_i64m2(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulh_vv_i64m4(vint64m4_t op1, vint64m4_t op2, size_t vl) { - return vmulh_vv_i64m4(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulh_vx_i64m4(vint64m4_t op1, int64_t op2, size_t vl) { - return vmulh_vx_i64m4(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulh_vv_i64m8(vint64m8_t op1, vint64m8_t op2, size_t vl) { - return vmulh_vv_i64m8(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulh_vx_i64m8(vint64m8_t op1, int64_t op2, size_t vl) { - return vmulh_vx_i64m8(op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vmulhu_vv_u8mf8( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vmulhu.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) @@ -1516,78 +1444,6 @@ return vmulhu_vx_u32m8(op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m1_t test_vmulhu_vv_u64m1(vuint64m1_t op1, vuint64m1_t op2, size_t vl) { - return vmulhu_vv_u64m1(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m1_t test_vmulhu_vx_u64m1(vuint64m1_t op1, uint64_t op2, size_t vl) { - return vmulhu_vx_u64m1(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m2_t test_vmulhu_vv_u64m2(vuint64m2_t op1, vuint64m2_t op2, size_t vl) { - return vmulhu_vv_u64m2(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m2_t test_vmulhu_vx_u64m2(vuint64m2_t op1, uint64_t op2, size_t vl) { - return vmulhu_vx_u64m2(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m4_t test_vmulhu_vv_u64m4(vuint64m4_t op1, vuint64m4_t op2, size_t vl) { - return vmulhu_vv_u64m4(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m4( -// 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m4_t test_vmulhu_vx_u64m4(vuint64m4_t op1, uint64_t op2, size_t vl) { - return vmulhu_vx_u64m4(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m8_t test_vmulhu_vv_u64m8(vuint64m8_t op1, vuint64m8_t op2, size_t vl) { - return vmulhu_vv_u64m8(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m8_t test_vmulhu_vx_u64m8(vuint64m8_t op1, uint64_t op2, size_t vl) { - return vmulhu_vx_u64m8(op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vmulhsu_vv_i8mf8( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) @@ -1912,78 +1768,6 @@ return vmulhsu_vx_i32m8(op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulhsu_vv_i64m1(vint64m1_t op1, vuint64m1_t op2, size_t vl) { - return vmulhsu_vv_i64m1(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulhsu_vx_i64m1(vint64m1_t op1, uint64_t op2, size_t vl) { - return vmulhsu_vx_i64m1(op1, op2, vl); 
-} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulhsu_vv_i64m2(vint64m2_t op1, vuint64m2_t op2, size_t vl) { - return vmulhsu_vv_i64m2(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulhsu_vx_i64m2(vint64m2_t op1, uint64_t op2, size_t vl) { - return vmulhsu_vx_i64m2(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulhsu_vv_i64m4(vint64m4_t op1, vuint64m4_t op2, size_t vl) { - return vmulhsu_vv_i64m4(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulhsu_vx_i64m4(vint64m4_t op1, uint64_t op2, size_t vl) { - return vmulhsu_vx_i64m4(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulhsu_vv_i64m8(vint64m8_t op1, vuint64m8_t op2, size_t vl) { - return vmulhsu_vv_i64m8(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 
[[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulhsu_vx_i64m8(vint64m8_t op1, uint64_t op2, size_t vl) { - return vmulhsu_vx_i64m8(op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vmul_vv_i8mf8_m( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmul.mask.nxv1i8.nxv1i8.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) @@ -3100,78 +2884,6 @@ return vmulh_vx_i32m8_m(mask, maskedoff, op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulh_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, vint64m1_t op2, size_t vl) { - return vmulh_vv_i64m1_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulh_vx_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, int64_t op2, size_t vl) { - return vmulh_vx_i64m1_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulh_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, vint64m2_t op2, size_t vl) { - return vmulh_vv_i64m2_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmulh.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulh_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, int64_t op2, size_t vl) { - return vmulh_vx_i64m2_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulh_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, vint64m4_t op2, size_t vl) { - return vmulh_vv_i64m4_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulh_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, int64_t op2, size_t vl) { - return vmulh_vx_i64m4_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vv_i64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulh_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, vint64m8_t op2, size_t vl) { - return vmulh_vv_i64m8_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulh_vx_i64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulh.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// 
-vint64m8_t test_vmulh_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, int64_t op2, size_t vl) { - return vmulh_vx_i64m8_m(mask, maskedoff, op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vmulhu_vv_u8mf8_m( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv1i8.nxv1i8.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) @@ -3496,78 +3208,6 @@ return vmulhu_vx_u32m8_m(mask, maskedoff, op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m1_t test_vmulhu_vv_u64m1_m(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t op1, vuint64m1_t op2, size_t vl) { - return vmulhu_vv_u64m1_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m1_t test_vmulhu_vx_u64m1_m(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t op1, uint64_t op2, size_t vl) { - return vmulhu_vx_u64m1_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m2_t test_vmulhu_vv_u64m2_m(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { - return vmulhu_vv_u64m2_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m2_t test_vmulhu_vx_u64m2_m(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t op1, uint64_t op2, size_t vl) { - return vmulhu_vx_u64m2_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m4_t test_vmulhu_vv_u64m4_m(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { - return vmulhu_vv_u64m4_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m4_t test_vmulhu_vx_u64m4_m(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t op1, uint64_t op2, size_t vl) { - return vmulhu_vx_u64m4_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vv_u64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m8_t test_vmulhu_vv_u64m8_m(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t op1, vuint64m8_t op2, size_t vl) { - return vmulhu_vv_u64m8_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhu_vx_u64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhu.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 
[[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vuint64m8_t test_vmulhu_vx_u64m8_m(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t op1, uint64_t op2, size_t vl) { - return vmulhu_vx_u64m8_m(mask, maskedoff, op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vmulhsu_vv_i8mf8_m( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv1i8.nxv1i8.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) @@ -3891,75 +3531,3 @@ vint32m8_t test_vmulhsu_vx_i32m8_m(vbool4_t mask, vint32m8_t maskedoff, vint32m8_t op1, uint32_t op2, size_t vl) { return vmulhsu_vx_i32m8_m(mask, maskedoff, op1, op2, vl); } - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulhsu_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, vuint64m1_t op2, size_t vl) { - return vmulhsu_vv_i64m1_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vmulhsu_vx_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, vint64m1_t op1, uint64_t op2, size_t vl) { - return vmulhsu_vx_i64m1_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulhsu_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, vuint64m2_t 
op2, size_t vl) { - return vmulhsu_vv_i64m2_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vmulhsu_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, vint64m2_t op1, uint64_t op2, size_t vl) { - return vmulhsu_vx_i64m2_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulhsu_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, vuint64m4_t op2, size_t vl) { - return vmulhsu_vv_i64m4_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vmulhsu_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, vint64m4_t op1, uint64_t op2, size_t vl) { - return vmulhsu_vx_i64m4_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vv_i64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulhsu_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, vuint64m8_t op2, size_t vl) { - return vmulhsu_vv_i64m8_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vmulhsu_vx_i64m8_m( -// 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmulhsu.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vmulhsu_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, vint64m8_t op1, uint64_t op2, size_t vl) { - return vmulhsu_vx_i64m8_m(mask, maskedoff, op1, op2, vl); -} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vsmul-eew64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vsmul-eew64.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vsmul-eew64.c @@ -0,0 +1,159 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck --check-prefix=CHECK-RV64 %s +// NOTE: The purpose of separating these 3 instructions from vsmul.c is that +// eew=64 versions only enable when V extension is specified. 
(Not for zve) + +#include + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vsmul_vv_i64m1(vint64m1_t op1, vint64m1_t op2, size_t vl) { + return vsmul_vv_i64m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m1( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vsmul_vx_i64m1(vint64m1_t op1, int64_t op2, size_t vl) { + return vsmul_vx_i64m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vsmul_vv_i64m2(vint64m2_t op1, vint64m2_t op2, size_t vl) { + return vsmul_vv_i64m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m2( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vsmul_vx_i64m2(vint64m2_t op1, int64_t op2, size_t vl) { + return vsmul_vx_i64m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vsmul_vv_i64m4(vint64m4_t op1, vint64m4_t op2, size_t vl) { + return vsmul_vv_i64m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m4( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vsmul_vx_i64m4(vint64m4_t op1, int64_t op2, size_t vl) { + return vsmul_vx_i64m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vsmul_vv_i64m8(vint64m8_t op1, vint64m8_t op2, size_t vl) { + return vsmul_vv_i64m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m8( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vsmul_vx_i64m8(vint64m8_t op1, int64_t op2, size_t vl) { + return vsmul_vx_i64m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vsmul_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, + vint64m1_t op1, vint64m1_t op2, size_t vl) { + return vsmul_vv_i64m1_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m1_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m1_t test_vsmul_vx_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, + vint64m1_t op1, int64_t op2, size_t vl) { + return vsmul_vx_i64m1_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 
[[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vsmul_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, + vint64m2_t op1, vint64m2_t op2, size_t vl) { + return vsmul_vv_i64m2_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m2_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv2i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m2_t test_vsmul_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, + vint64m2_t op1, int64_t op2, size_t vl) { + return vsmul_vx_i64m2_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vsmul_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, + vint64m4_t op1, vint64m4_t op2, size_t vl) { + return vsmul_vv_i64m4_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m4_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m4_t test_vsmul_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, + vint64m4_t op1, int64_t op2, size_t vl) { + return vsmul_vx_i64m4_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vv_i64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vsmul_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, + vint64m8_t op1, vint64m8_t op2, 
size_t vl) { + return vsmul_vv_i64m8_m(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: @test_vsmul_vx_i64m8_m( +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_vsmul_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, + vint64m8_t op1, int64_t op2, size_t vl) { + return vsmul_vx_i64m8_m(mask, maskedoff, op1, op2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vsmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vsmul.c --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vsmul.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vsmul.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: riscv-registered-target -// RUN: %clang_cc1 -triple riscv64 -target-feature +v -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck --check-prefix=CHECK-RV64 %s +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck --check-prefix=CHECK-RV64 %s #include @@ -328,78 +328,6 @@ return vsmul_vx_i32m8(op1, op2, vl); } -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vsmul_vv_i64m1(vint64m1_t op1, vint64m1_t op2, size_t vl) { - return vsmul_vv_i64m1(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m1( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vsmul_vx_i64m1(vint64m1_t op1, int64_t op2, size_t vl) { - return vsmul_vx_i64m1(op1, op2, vl); -} - -// CHECK-RV64-LABEL: 
@test_vsmul_vv_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vsmul_vv_i64m2(vint64m2_t op1, vint64m2_t op2, size_t vl) { - return vsmul_vv_i64m2(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m2( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vsmul_vx_i64m2(vint64m2_t op1, int64_t op2, size_t vl) { - return vsmul_vx_i64m2(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vsmul_vv_i64m4(vint64m4_t op1, vint64m4_t op2, size_t vl) { - return vsmul_vv_i64m4(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m4( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vsmul_vx_i64m4(vint64m4_t op1, int64_t op2, size_t vl) { - return vsmul_vx_i64m4(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vsmul_vv_i64m8(vint64m8_t op1, vint64m8_t op2, size_t vl) { - return vsmul_vv_i64m8(op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m8( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], i64 [[VL:%.*]]) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t 
test_vsmul_vx_i64m8(vint64m8_t op1, int64_t op2, size_t vl) { - return vsmul_vx_i64m8(op1, op2, vl); -} - // CHECK-RV64-LABEL: @test_vsmul_vv_i8mf8_m( // CHECK-RV64-NEXT: entry: // CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) @@ -762,83 +690,3 @@ vint32m8_t op1, int32_t op2, size_t vl) { return vsmul_vx_i32m8_m(mask, maskedoff, op1, op2, vl); } - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vsmul_vv_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, - vint64m1_t op1, vint64m1_t op2, size_t vl) { - return vsmul_vv_i64m1_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m1_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv1i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m1_t test_vsmul_vx_i64m1_m(vbool64_t mask, vint64m1_t maskedoff, - vint64m1_t op1, int64_t op2, size_t vl) { - return vsmul_vx_i64m1_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vsmul_vv_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, - vint64m2_t op1, vint64m2_t op2, size_t vl) { - return vsmul_vv_i64m2_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m2_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv2i64.i64.i64( 
[[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m2_t test_vsmul_vx_i64m2_m(vbool32_t mask, vint64m2_t maskedoff, - vint64m2_t op1, int64_t op2, size_t vl) { - return vsmul_vx_i64m2_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vsmul_vv_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, - vint64m4_t op1, vint64m4_t op2, size_t vl) { - return vsmul_vv_i64m4_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m4_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv4i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m4_t test_vsmul_vx_i64m4_m(vbool16_t mask, vint64m4_t maskedoff, - vint64m4_t op1, int64_t op2, size_t vl) { - return vsmul_vx_i64m4_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vv_i64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv8i64.nxv8i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t test_vsmul_vv_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, - vint64m8_t op1, vint64m8_t op2, size_t vl) { - return vsmul_vv_i64m8_m(mask, maskedoff, op1, op2, vl); -} - -// CHECK-RV64-LABEL: @test_vsmul_vx_i64m8_m( -// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsmul.mask.nxv8i64.i64.i64( [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 0) -// CHECK-RV64-NEXT: ret [[TMP0]] -// -vint64m8_t 
test_vsmul_vx_i64m8_m(vbool8_t mask, vint64m8_t maskedoff, - vint64m8_t op1, int64_t op2, size_t vl) { - return vsmul_vx_i64m8_m(mask, maskedoff, op1, op2, vl); -} diff --git a/clang/unittests/Tooling/SourceCodeBuildersTest.cpp b/clang/unittests/Tooling/SourceCodeBuildersTest.cpp --- a/clang/unittests/Tooling/SourceCodeBuildersTest.cpp +++ b/clang/unittests/Tooling/SourceCodeBuildersTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang/Tooling/Transformer/SourceCodeBuilders.h" +#include "clang/AST/Type.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Tooling/Tooling.h" @@ -24,8 +25,23 @@ // Create a valid translation unit from a statement. static std::string wrapSnippet(StringRef StatementCode) { - return ("struct S { S(); S(int); int field; };\n" + return ("namespace std {\n" + "template struct unique_ptr {\n" + " T* operator->() const;\n" + " T& operator*() const;\n" + "};\n" + "template struct shared_ptr {\n" + " T* operator->() const;\n" + " T& operator*() const;\n" + "};\n" + "}\n" + "struct A { void super(); };\n" + "struct S : public A { S(); S(int); int Field; };\n" "S operator+(const S &a, const S &b);\n" + "struct Smart {\n" + " S* operator->() const;\n" + " S& operator*() const;\n" + "};\n" "auto test_snippet = []{" + StatementCode + "};") .str(); @@ -51,7 +67,8 @@ // `StatementCode` may contain other statements not described by `Matcher`. 
static llvm::Optional matchStmt(StringRef StatementCode, StatementMatcher Matcher) { - auto AstUnit = buildASTFromCode(wrapSnippet(StatementCode)); + auto AstUnit = buildASTFromCodeWithArgs(wrapSnippet(StatementCode), + {"-Wno-unused-value"}); if (AstUnit == nullptr) { ADD_FAILURE() << "AST construction failed"; return llvm::None; @@ -95,7 +112,7 @@ testPredicate(needParensAfterUnaryOperator, "int(3.0);", false); testPredicate(needParensAfterUnaryOperator, "void f(); f();", false); testPredicate(needParensAfterUnaryOperator, "int a[3]; a[0];", false); - testPredicate(needParensAfterUnaryOperator, "S x; x.field;", false); + testPredicate(needParensAfterUnaryOperator, "S x; x.Field;", false); testPredicate(needParensAfterUnaryOperator, "int x = 1; --x;", false); testPredicate(needParensAfterUnaryOperator, "int x = 1; -x;", false); } @@ -117,7 +134,7 @@ testPredicate(mayEverNeedParens, "int(3.0);", false); testPredicate(mayEverNeedParens, "void f(); f();", false); testPredicate(mayEverNeedParens, "int a[3]; a[0];", false); - testPredicate(mayEverNeedParens, "S x; x.field;", false); + testPredicate(mayEverNeedParens, "S x; x.Field;", false); } TEST(SourceCodeBuildersTest, mayEverNeedParensInImplictConversion) { @@ -126,6 +143,50 @@ testPredicateOnArg(mayEverNeedParens, "void f(S); f(3 + 5);", true); } +TEST(SourceCodeBuildersTest, isKnownPointerLikeTypeUniquePtr) { + std::string Snippet = "std::unique_ptr P; P;"; + auto StmtMatch = + matchStmt(Snippet, declRefExpr(hasType(qualType().bind("ty")))); + ASSERT_TRUE(StmtMatch) << "Snippet: " << Snippet; + EXPECT_TRUE( + isKnownPointerLikeType(*StmtMatch->Result.Nodes.getNodeAs("ty"), + *StmtMatch->Result.Context)) + << "Snippet: " << Snippet; +} + +TEST(SourceCodeBuildersTest, isKnownPointerLikeTypeSharedPtr) { + std::string Snippet = "std::shared_ptr P; P;"; + auto StmtMatch = + matchStmt(Snippet, declRefExpr(hasType(qualType().bind("ty")))); + ASSERT_TRUE(StmtMatch) << "Snippet: " << Snippet; + EXPECT_TRUE( + 
isKnownPointerLikeType(*StmtMatch->Result.Nodes.getNodeAs("ty"), + *StmtMatch->Result.Context)) + << "Snippet: " << Snippet; +} + +TEST(SourceCodeBuildersTest, isKnownPointerLikeTypeUnknownTypeFalse) { + std::string Snippet = "Smart P; P;"; + auto StmtMatch = + matchStmt(Snippet, declRefExpr(hasType(qualType().bind("ty")))); + ASSERT_TRUE(StmtMatch) << "Snippet: " << Snippet; + EXPECT_FALSE( + isKnownPointerLikeType(*StmtMatch->Result.Nodes.getNodeAs("ty"), + *StmtMatch->Result.Context)) + << "Snippet: " << Snippet; +} + +TEST(SourceCodeBuildersTest, isKnownPointerLikeTypeNormalTypeFalse) { + std::string Snippet = "int *P; P;"; + auto StmtMatch = + matchStmt(Snippet, declRefExpr(hasType(qualType().bind("ty")))); + ASSERT_TRUE(StmtMatch) << "Snippet: " << Snippet; + EXPECT_FALSE( + isKnownPointerLikeType(*StmtMatch->Result.Nodes.getNodeAs("ty"), + *StmtMatch->Result.Context)) + << "Snippet: " << Snippet; +} + static void testBuilder( llvm::Optional (*Builder)(const Expr &, const ASTContext &), StringRef Snippet, StringRef Expected) { @@ -136,6 +197,15 @@ ValueIs(std::string(Expected))); } +static void testBuildAccess(StringRef Snippet, StringRef Expected, + PLTClass C = PLTClass::Pointer) { + auto StmtMatch = matchStmt(Snippet, expr().bind("expr")); + ASSERT_TRUE(StmtMatch); + EXPECT_THAT(buildAccess(*StmtMatch->Result.Nodes.getNodeAs("expr"), + *StmtMatch->Result.Context, C), + ValueIs(std::string(Expected))); +} + TEST(SourceCodeBuildersTest, BuildParensUnaryOp) { testBuilder(buildParens, "-4;", "(-4)"); } @@ -245,4 +315,117 @@ TEST(SourceCodeBuildersTest, BuildArrowValueAddressWithParens) { testBuilder(buildArrow, "S x; &(true ? x : x);", "(true ? 
x : x)."); } + +TEST(SourceCodeBuildersTest, BuildAccessValue) { + testBuildAccess("S x; x;", "x."); +} + +TEST(SourceCodeBuildersTest, BuildAccessPointerDereference) { + testBuildAccess("S *x; *x;", "x->"); +} + +TEST(SourceCodeBuildersTest, BuildAccessPointerDereferenceIgnoresParens) { + testBuildAccess("S *x; *(x);", "x->"); +} + +TEST(SourceCodeBuildersTest, BuildAccessValueBinaryOperation) { + testBuildAccess("S x; x + x;", "(x + x)."); +} + +TEST(SourceCodeBuildersTest, BuildAccessPointerDereferenceExprWithParens) { + testBuildAccess("S *x; *(x + 1);", "(x + 1)->"); +} + +TEST(SourceCodeBuildersTest, BuildAccessPointer) { + testBuildAccess("S *x; x;", "x->"); +} + +TEST(SourceCodeBuildersTest, BuildAccessValueAddress) { + testBuildAccess("S x; &x;", "x."); +} + +TEST(SourceCodeBuildersTest, BuildAccessValueAddressIgnoresParens) { + testBuildAccess("S x; &(x);", "x."); +} + +TEST(SourceCodeBuildersTest, BuildAccessPointerBinaryOperation) { + testBuildAccess("S *x; x + 1;", "(x + 1)->"); +} + +TEST(SourceCodeBuildersTest, BuildAccessValueAddressWithParens) { + testBuildAccess("S x; &(true ? x : x);", "(true ? 
x : x)."); +} + +TEST(SourceCodeBuildersTest, BuildAccessSmartPointer) { + testBuildAccess("std::unique_ptr x; x;", "x->"); +} + +TEST(SourceCodeBuildersTest, BuildAccessSmartPointerAsValue) { + testBuildAccess("std::unique_ptr x; x;", "x.", PLTClass::Value); +} + +TEST(SourceCodeBuildersTest, BuildAccessSmartPointerDeref) { + testBuildAccess("std::unique_ptr x; *x;", "x->"); +} + +TEST(SourceCodeBuildersTest, BuildAccessSmartPointerDerefAsValue) { + testBuildAccess("std::unique_ptr x; *x;", "(*x).", PLTClass::Value); +} + +TEST(SourceCodeBuildersTest, BuildAccessSmartPointerMemberCall) { + StringRef Snippet = R"cc( + Smart x; + x->Field; + )cc"; + auto StmtMatch = + matchStmt(Snippet, memberExpr(hasObjectExpression(expr().bind("expr")))); + ASSERT_TRUE(StmtMatch); + EXPECT_THAT(buildAccess(*StmtMatch->Result.Nodes.getNodeAs("expr"), + *StmtMatch->Result.Context), + ValueIs(std::string("x->"))); +} + +TEST(SourceCodeBuildersTest, BuildAccessIgnoreImplicit) { + StringRef Snippet = R"cc( + S x; + A *a; + a = &x; + )cc"; + auto StmtMatch = + matchStmt(Snippet, binaryOperator(isAssignmentOperator(), + hasRHS(expr().bind("expr")))); + ASSERT_TRUE(StmtMatch); + EXPECT_THAT(buildAccess(*StmtMatch->Result.Nodes.getNodeAs("expr"), + *StmtMatch->Result.Context), + ValueIs(std::string("x."))); +} + +TEST(SourceCodeBuildersTest, BuildAccessImplicitThis) { + StringRef Snippet = R"cc( + struct Struct { + void foo() {} + void bar() { + foo(); + } + }; + )cc"; + auto StmtMatch = matchStmt( + Snippet, + cxxMemberCallExpr(onImplicitObjectArgument(cxxThisExpr().bind("expr")))); + ASSERT_TRUE(StmtMatch); + EXPECT_THAT(buildAccess(*StmtMatch->Result.Nodes.getNodeAs("expr"), + *StmtMatch->Result.Context), + ValueIs(std::string())); +} + +TEST(SourceCodeBuildersTest, BuildAccessImplicitThisIgnoreImplicitCasts) { + StringRef Snippet = "struct B : public A { void f() { super(); } };"; + auto StmtMatch = matchStmt( + Snippet, + 
cxxMemberCallExpr(onImplicitObjectArgument(expr().bind("expr")))); + ASSERT_TRUE(StmtMatch); + EXPECT_THAT(buildAccess(*StmtMatch->Result.Nodes.getNodeAs("expr"), + *StmtMatch->Result.Context), + ValueIs(std::string())); +} } // namespace diff --git a/clang/unittests/Tooling/StencilTest.cpp b/clang/unittests/Tooling/StencilTest.cpp --- a/clang/unittests/Tooling/StencilTest.cpp +++ b/clang/unittests/Tooling/StencilTest.cpp @@ -36,10 +36,13 @@ namespace N { class C {}; } namespace { class AnonC {}; } struct S { int Field; }; - struct Smart { - S* operator->() const; - S& operator*() const; + namespace std { + template + struct unique_ptr { + T* operator->() const; + T& operator*() const; }; + } )cc"; return (Preface + ExtraPreface + "auto stencil_test_snippet = []{" + StatementCode + "};") @@ -326,32 +329,15 @@ TEST_F(StencilTest, MaybeDerefSmartPointer) { StringRef Id = "id"; std::string Snippet = R"cc( - Smart x; + std::unique_ptr x; x; )cc"; testExpr(Id, Snippet, maybeDeref(Id), "*x"); } -// Tests that unique_ptr specifically is handled. -TEST_F(StencilTest, MaybeDerefSmartPointerUniquePtr) { - StringRef Id = "id"; - // We deliberately specify `unique_ptr` as empty to verify that it matches - // because of its name, rather than its contents. 
- StringRef ExtraPreface = - "namespace std { template class unique_ptr {}; }\n"; - StringRef Snippet = R"cc( - std::unique_ptr x; - x; - )cc"; - auto StmtMatch = matchStmt(Snippet, expr().bind(Id), ExtraPreface); - ASSERT_TRUE(StmtMatch); - EXPECT_THAT_EXPECTED(maybeDeref(Id)->eval(StmtMatch->Result), - HasValue(std::string("*x"))); -} - TEST_F(StencilTest, MaybeDerefSmartPointerFromMemberExpr) { StringRef Id = "id"; - std::string Snippet = "Smart x; x->Field;"; + std::string Snippet = "std::unique_ptr x; x->Field;"; auto StmtMatch = matchStmt(Snippet, memberExpr(hasObjectExpression(expr().bind(Id)))); ASSERT_TRUE(StmtMatch); @@ -381,12 +367,12 @@ TEST_F(StencilTest, MaybeAddressOfSmartPointer) { StringRef Id = "id"; - testExpr(Id, "Smart x; x;", maybeAddressOf(Id), "x"); + testExpr(Id, "std::unique_ptr x; x;", maybeAddressOf(Id), "x"); } TEST_F(StencilTest, MaybeAddressOfSmartPointerFromMemberCall) { StringRef Id = "id"; - std::string Snippet = "Smart x; x->Field;"; + std::string Snippet = "std::unique_ptr x; x->Field;"; auto StmtMatch = matchStmt(Snippet, memberExpr(hasObjectExpression(expr().bind(Id)))); ASSERT_TRUE(StmtMatch); @@ -396,7 +382,7 @@ TEST_F(StencilTest, MaybeAddressOfSmartPointerDerefNoCancel) { StringRef Id = "id"; - testExpr(Id, "Smart x; *x;", maybeAddressOf(Id), "&*x"); + testExpr(Id, "std::unique_ptr x; *x;", maybeAddressOf(Id), "&*x"); } TEST_F(StencilTest, AccessOpValue) { @@ -446,7 +432,7 @@ TEST_F(StencilTest, AccessOpSmartPointer) { StringRef Snippet = R"cc( - Smart x; + std::unique_ptr x; x; )cc"; StringRef Id = "id"; @@ -455,7 +441,7 @@ TEST_F(StencilTest, AccessOpSmartPointerDereference) { StringRef Snippet = R"cc( - Smart x; + std::unique_ptr x; *x; )cc"; StringRef Id = "id"; @@ -464,7 +450,7 @@ TEST_F(StencilTest, AccessOpSmartPointerMemberCall) { StringRef Snippet = R"cc( - Smart x; + std::unique_ptr x; x->Field; )cc"; StringRef Id = "id"; diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp 
--- a/clang/utils/TableGen/RISCVVEmitter.cpp +++ b/clang/utils/TableGen/RISCVVEmitter.cpp @@ -141,11 +141,12 @@ enum RISCVPredefinedMacro : RISCVPredefinedMacroT { Basic = 0, - Zfh = 1 << 1, - RV64 = 1 << 2, - VectorMaxELen64 = 1 << 3, - VectorMaxELenFp32 = 1 << 4, - VectorMaxELenFp64 = 1 << 5, + V = 1 << 1, + Zfh = 1 << 2, + RV64 = 1 << 3, + VectorMaxELen64 = 1 << 4, + VectorMaxELenFp32 = 1 << 5, + VectorMaxELenFp64 = 1 << 6, }; // TODO refactor RVVIntrinsic class design after support all intrinsic @@ -808,6 +809,11 @@ for (auto Feature : RequiredFeatures) { if (Feature == "RV64") RISCVPredefinedMacros |= RISCVPredefinedMacro::RV64; + // Note: Full multiply instruction (mulh, mulhu, mulhsu, smul) for EEW=64 + // require V. + if (Feature == "FullMultiply" && + (RISCVPredefinedMacros & RISCVPredefinedMacro::VectorMaxELen64)) + RISCVPredefinedMacros |= RISCVPredefinedMacro::V; } // Init OutputType and InputTypes @@ -1314,6 +1320,8 @@ return false; OS << "#if "; ListSeparator LS(" && "); + if (PredefinedMacros & RISCVPredefinedMacro::V) + OS << LS << "defined(__riscv_v)"; if (PredefinedMacros & RISCVPredefinedMacro::Zfh) OS << LS << "defined(__riscv_zfh)"; if (PredefinedMacros & RISCVPredefinedMacro::RV64) diff --git a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c --- a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c +++ b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c @@ -1,12 +1,18 @@ // REQUIRES: zlib // Value profiling is currently not supported in lightweight mode. 
-// RUN: %clang_pgogen -o %t -g -gdwarf-4 -mllvm --debug-info-correlate -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp -// RUN: env LLVM_PROFILE_FILE=%t.proflite %run %t -// RUN: llvm-profdata merge -o %t.profdata --debug-info=%t %t.proflite - // RUN: %clang_pgogen -o %t.normal -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t.normal // RUN: llvm-profdata merge -o %t.normal.profdata %t.profraw +// RUN: %clang_pgogen -o %t.d4 -g -gdwarf-4 -mllvm --debug-info-correlate -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp +// RUN: env LLVM_PROFILE_FILE=%t.d4.proflite %run %t.d4 +// RUN: llvm-profdata merge -o %t.d4.profdata --debug-info=%t.d4 %t.d4.proflite + +// RUN: diff %t.normal.profdata %t.d4.profdata + +// RUN: %clang_pgogen -o %t -g -mllvm --debug-info-correlate -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp +// RUN: env LLVM_PROFILE_FILE=%t.proflite %run %t +// RUN: llvm-profdata merge -o %t.profdata --debug-info=%t %t.proflite + // RUN: diff %t.normal.profdata %t.profdata diff --git a/compiler-rt/test/profile/Posix/instrprof-get-filename-merge-mode.c b/compiler-rt/test/profile/Posix/instrprof-get-filename-merge-mode.c --- a/compiler-rt/test/profile/Posix/instrprof-get-filename-merge-mode.c +++ b/compiler-rt/test/profile/Posix/instrprof-get-filename-merge-mode.c @@ -1,6 +1,6 @@ // Test __llvm_profile_get_filename when the on-line merging mode is enabled. 
// -// RUN: %clang_pgogen -fPIC -shared -o %t.dso %p/../Inputs/instrprof-get-filename-dso.c +// RUN: %clang_pgogen -fPIC -shared %shared_linker_xopts -o %t.dso %p/../Inputs/instrprof-get-filename-dso.c // RUN: %clang_pgogen -o %t %s %t.dso // RUN: env LLVM_PROFILE_FILE="%t-%m.profraw" %run %t diff --git a/compiler-rt/test/profile/Posix/lit.local.cfg.py b/compiler-rt/test/profile/Posix/lit.local.cfg.py --- a/compiler-rt/test/profile/Posix/lit.local.cfg.py +++ b/compiler-rt/test/profile/Posix/lit.local.cfg.py @@ -7,3 +7,10 @@ if root.host_os in ['Windows']: config.unsupported = True + +# AIX usually usually makes use of an explicit export list when linking a shared +# object, but for the purposes of these tests just export all symbols. +if root.host_os in ['AIX']: + config.substitutions.append(('%shared_linker_xopts', '-Wl,-bexpfull')) +else: + config.substitutions.append(('%shared_linker_xopts', '')) diff --git a/libcxx/docs/Status/FormatIssues.csv b/libcxx/docs/Status/FormatIssues.csv --- a/libcxx/docs/Status/FormatIssues.csv +++ b/libcxx/docs/Status/FormatIssues.csv @@ -1,6 +1,6 @@ Number,Name,Assignee,Patch,Status,First released version `P0645 `_,"Text Formatting",Mark de Wever,,|Complete|,Clang 14 -`P1652 `_,"Printf corner cases in std::format",Mark de Wever,"`D103433 `__, `D114001 `__",|Review|, +`P1652 `_,"Printf corner cases in std::format",Mark de Wever,"`D103433 `__, `D114001 `__",|Complete|,Clang 14 `P1892 `_,"Extended locale-specific presentation specifiers for std::format",Mark de Wever,`D103368 `__,|Complete|,Clang 14 `P1868 `_,"width: clarifying units of width and precision in std::format (Implements the unicode support.)",Mark de Wever,"`D103413 `__ `D103425 `__ `D103670 `__",|Complete|,Clang 14 `P2216 `_,"std::format improvements",Mark de Wever,,|In Progress|, diff --git a/libcxx/docs/Status/FormatPaper.csv b/libcxx/docs/Status/FormatPaper.csv --- a/libcxx/docs/Status/FormatPaper.csv +++ b/libcxx/docs/Status/FormatPaper.csv @@ -6,21 +6,21 @@ 
`[format.context] `_,"Class template basic_format_context",,Mark de Wever,`D103357 `__,|Complete|,Clang 14 `[format.args] `_,"Class template basic_format_args",,Mark de Wever,`D103357 `__,|Complete|,Clang 14 `[format.arg] `_,"Class template basic_format_arg",,Mark de Wever,`D103357 `__,|Complete|,Clang 14 -`[format.arg] `_,"Class template basic_format_arg - handle",,Unassigned,,|Not Started|, -`[format.arg] `_,"Class template basic_format_arg - pointers",,Mark de Wever,,|In Progress|, +`[format.arg] `_,"Class template basic_format_arg - handle",,Mark de Wever,,|Complete|,Clang 14 +`[format.arg] `_,"Class template basic_format_arg - pointers",,Mark de Wever,,|Complete|,Clang 14 `[format.arg.store] `_,"Class template format-arg-store",,Mark de Wever,`D103357 `__,|Complete|,Clang 14 `[format.formatter.spec] `_,"Formatter specializations - character types",,Mark de Wever,"`D96664 `__ `D103466 `__",|Complete|,Clang 14 `[format.formatter.spec] `_,"Formatter specializations - string types",,Mark de Wever,"`D96664 `__ `D103425 `__",|Complete|,Clang 14 `[format.formatter.spec] `_,"Formatter specializations - boolean type",,Mark de Wever,"`D96664 `__ `D103670 `__",|Complete|,Clang 14 `[format.formatter.spec] `_,"Formatter specializations - integral types",,Mark de Wever,"`D96664 `__ `D103433 `__",|Complete|,Clang 14 -`[format.formatter.spec] `_,"Formatter specializations - floating-point types",`D70631 `__,Mark de Wever,`D114001 `__,|Review|, -`[format.formatter.spec] `_,"Formatter specializations - pointer types",,Mark de Wever,,|In Progress|, +`[format.formatter.spec] `_,"Formatter specializations - floating-point types",`D70631 `__,Mark de Wever,`D114001 `__,|Complete|,Clang 14 +`[format.formatter.spec] `_,"Formatter specializations - pointer types",,Mark de Wever,,|Complete|,Clang 14 `[format.string.std] `_,"Standard format specifiers - character types",,Mark de Wever,`D103368 `__,|Complete|,Clang 14 `[format.string.std] `_,"Standard format specifiers - string 
types",`D103379 `__,Mark de Wever,"`D103368 `__ `D103413 `__",|Complete|,Clang 14 `[format.string.std] `_,"Standard format specifiers - boolean type",`D103379 `__,Mark de Wever,"`D103368 `__ `D103413 `__",|Complete|,Clang 14 `[format.string.std] `_,"Standard format specifiers - integral types",,Mark de Wever,`D103368 `__,|Complete|,Clang 14 -`[format.string.std] `_,"Standard format specifiers - floating-point types",,Mark de Wever,`D114001 `__,|Review|, -`[format.string.std] `_,"Standard format specifiers - pointer types",,Mark de Wever,,|In Progress|, +`[format.string.std] `_,"Standard format specifiers - floating-point types",,Mark de Wever,`D114001 `__,|Complete|,Clang 14 +`[format.string.std] `_,"Standard format specifiers - pointer types",,Mark de Wever,,|Complete|,Clang 14 `[format.functions] `_,"Format functions - format(string_view fmt, const Args&... args);",,Mark de Wever,`D96664 `__,|Complete|,Clang 14 `[format.functions] `_,"Format functions - format(wstring_view fmt, const Args&... args);",,Mark de Wever,`D96664 `__,|Complete|,Clang 14 `[format.functions] `_,"Format functions - format(const locale& loc, string_view fmt, const Args&... 
args);",,Mark de Wever,`D96664 `__,|Complete|,Clang 14 diff --git a/libcxx/test/std/containers/sequences/vector.bool/get_allocator.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/get_allocator.pass.cpp --- a/libcxx/test/std/containers/sequences/vector.bool/get_allocator.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/get_allocator.pass.cpp @@ -20,13 +20,13 @@ int main(int, char**) { { - std::allocator alloc; + std::allocator alloc; const std::vector vb(alloc); assert(vb.get_allocator() == alloc); } { - other_allocator alloc(1); - const std::vector > vb(alloc); + other_allocator alloc(1); + const std::vector > vb(alloc); assert(vb.get_allocator() == alloc); } diff --git a/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp @@ -27,33 +27,31 @@ #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... args) { std::basic_string out = std::format(std::locale(), fmt, args...); if constexpr (std::same_as) if (out != expected) - std::cerr << "\nFormat string " << fmt << "\nExpected output " - << expected << "\nActual output " << out << '\n'; + std::cerr << "\nFormat string " << fmt << "\nExpected output " << expected << "\nActual output " << out + << '\n'; assert(out == expected); }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... 
args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { std::format(std::locale(), fmt, args...); if constexpr (std::same_as) - std::cerr << "\nFormat string " << fmt - << "\nDidn't throw an exception.\n"; + std::cerr << "\nFormat string " << fmt << "\nDidn't throw an exception.\n"; assert(false); } catch (std::format_error& e) { -#ifdef _LIBCPP_VERSION +# ifdef _LIBCPP_VERSION if constexpr (std::same_as) if (e.what() != what) - std::cerr << "\nFormat string " << fmt << "\nExpected exception " - << what << "\nActual exception " << e.what() << '\n'; -#endif + std::cerr << "\nFormat string " << fmt << "\nExpected exception " << what << "\nActual exception " + << e.what() << '\n'; +# endif LIBCPP_ASSERT(e.what() == what); return; } diff --git a/libcxx/test/std/utilities/format/format.functions/format.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format.pass.cpp @@ -25,44 +25,42 @@ #include #include #ifndef _LIBCPP_HAS_NO_LOCALIZATION -#include +# include #endif #include #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... args) { std::basic_string out = std::format(fmt, args...); #ifndef _LIBCPP_HAS_NO_LOCALIZATION if constexpr (std::same_as) if (out != expected) - std::cerr << "\nFormat string " << fmt << "\nExpected output " - << expected << "\nActual output " << out << '\n'; + std::cerr << "\nFormat string " << fmt << "\nExpected output " << expected << "\nActual output " << out + << '\n'; #endif assert(out == expected); }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... 
args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { std::format(fmt, args...); -#ifndef _LIBCPP_HAS_NO_LOCALIZATION +# ifndef _LIBCPP_HAS_NO_LOCALIZATION if constexpr (std::same_as) - std::cerr << "\nFormat string " << fmt - << "\nDidn't throw an exception.\n"; -#endif + std::cerr << "\nFormat string " << fmt << "\nDidn't throw an exception.\n"; +# endif assert(false); } catch (std::format_error& e) { -#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) if constexpr (std::same_as) if (e.what() != what) - std::cerr << "\nFormat string " << fmt << "\nExpected exception " - << what << "\nActual exception " << e.what() << '\n'; -#endif + std::cerr << "\nFormat string " << fmt << "\nExpected exception " << what << "\nActual exception " + << e.what() << '\n'; +# endif LIBCPP_ASSERT(e.what() == what); return; } diff --git a/libcxx/test/std/utilities/format/format.functions/format_tests.h b/libcxx/test/std/utilities/format/format.functions/format_tests.h --- a/libcxx/test/std/utilities/format/format.functions/format_tests.h +++ b/libcxx/test/std/utilities/format/format.functions/format_tests.h @@ -134,10 +134,10 @@ std::vector> invalid_types(std::string valid) { std::vector> result; -#define CASE(T) \ - case #T[0]: \ - result.push_back(STR("Invalid formatter type {:" #T "}")); \ - break; +#define CASE(T) \ +case #T[0]: \ + result.push_back(STR("Invalid formatter type {:" #T "}")); \ + break; for (auto type : "aAbBcdeEfFgGopsxX") { if (valid.find(type) != std::string::npos) @@ -173,18 +173,15 @@ } template -void format_test_string(T world, T universe, TestFunction check, - ExceptionTest check_exception) { +void format_test_string(T world, T universe, TestFunction check, ExceptionTest check_exception) { // *** Valid input tests *** // Unsed argument is ignored. TODO FMT what does the Standard mandate? 
check(STR("hello world"), STR("hello {}"), world, universe); - check(STR("hello world and universe"), STR("hello {} and {}"), world, - universe); + check(STR("hello world and universe"), STR("hello {} and {}"), world, universe); check(STR("hello world"), STR("hello {0}"), world, universe); check(STR("hello universe"), STR("hello {1}"), world, universe); - check(STR("hello universe and world"), STR("hello {1} and {0}"), world, - universe); + check(STR("hello universe and world"), STR("hello {1} and {0}"), world, universe); check(STR("hello world"), STR("hello {:_>}"), world); check(STR("hello world"), STR("hello {:>8}"), world); @@ -225,97 +222,69 @@ check(STR("hello uni#####"), STR("hello {:#<8.3s}"), universe); // *** sign *** - check_exception("The format-spec should consume the input or end with a '}'", - STR("hello {:-}"), world); + check_exception("The format-spec should consume the input or end with a '}'", STR("hello {:-}"), world); // *** alternate form *** - check_exception("The format-spec should consume the input or end with a '}'", - STR("hello {:#}"), world); + check_exception("The format-spec should consume the input or end with a '}'", STR("hello {:#}"), world); // *** zero-padding *** - check_exception("A format-spec width field shouldn't have a leading zero", - STR("hello {:0}"), world); + check_exception("A format-spec width field shouldn't have a leading zero", STR("hello {:0}"), world); // *** width *** #ifdef _LIBCPP_VERSION // This limit isn't specified in the Standard. 
- static_assert(std::__format::__number_max == 2'147'483'647, - "Update the assert and the test."); - check_exception("The numeric value of the format-spec is too large", - STR("{:2147483648}"), world); - check_exception("The numeric value of the format-spec is too large", - STR("{:5000000000}"), world); - check_exception("The numeric value of the format-spec is too large", - STR("{:10000000000}"), world); + static_assert(std::__format::__number_max == 2'147'483'647, "Update the assert and the test."); + check_exception("The numeric value of the format-spec is too large", STR("{:2147483648}"), world); + check_exception("The numeric value of the format-spec is too large", STR("{:5000000000}"), world); + check_exception("The numeric value of the format-spec is too large", STR("{:10000000000}"), world); #endif - check_exception( - "A format-spec width field replacement should have a positive value", - STR("hello {:{}}"), world, 0); - check_exception( - "A format-spec arg-id replacement shouldn't have a negative value", - STR("hello {:{}}"), world, -1); - check_exception( - "A format-spec arg-id replacement exceeds the maximum supported value", - STR("hello {:{}}"), world, unsigned(-1)); + check_exception("A format-spec width field replacement should have a positive value", STR("hello {:{}}"), world, 0); + check_exception("A format-spec arg-id replacement shouldn't have a negative value", STR("hello {:{}}"), world, -1); + check_exception("A format-spec arg-id replacement exceeds the maximum supported value", STR("hello {:{}}"), world, + unsigned(-1)); check_exception("Argument index out of bounds", STR("hello {:{}}"), world); - check_exception( - "A format-spec arg-id replacement argument isn't an integral type", - STR("hello {:{}}"), world, universe); - check_exception( - "Using manual argument numbering in automatic argument numbering mode", - STR("hello {:{0}}"), world, 1); - check_exception( - "Using automatic argument numbering in manual argument numbering mode", 
- STR("hello {0:{}}"), world, 1); + check_exception("A format-spec arg-id replacement argument isn't an integral type", STR("hello {:{}}"), world, + universe); + check_exception("Using manual argument numbering in automatic argument numbering mode", STR("hello {:{0}}"), world, + 1); + check_exception("Using automatic argument numbering in manual argument numbering mode", STR("hello {0:{}}"), world, + 1); // Arg-id may not have leading zeros. check_exception("Invalid arg-id", STR("hello {0:{01}}"), world, 1); // *** precision *** #ifdef _LIBCPP_VERSION // This limit isn't specified in the Standard. - static_assert(std::__format::__number_max == 2'147'483'647, - "Update the assert and the test."); - check_exception("The numeric value of the format-spec is too large", - STR("{:.2147483648}"), world); - check_exception("The numeric value of the format-spec is too large", - STR("{:.5000000000}"), world); - check_exception("The numeric value of the format-spec is too large", - STR("{:.10000000000}"), world); + static_assert(std::__format::__number_max == 2'147'483'647, "Update the assert and the test."); + check_exception("The numeric value of the format-spec is too large", STR("{:.2147483648}"), world); + check_exception("The numeric value of the format-spec is too large", STR("{:.5000000000}"), world); + check_exception("The numeric value of the format-spec is too large", STR("{:.10000000000}"), world); #endif // Precision 0 allowed, but not useful for string arguments. check(STR("hello "), STR("hello {:.{}}"), world, 0); // Precision may have leading zeros. Secondly tests the value is still base 10. 
check(STR("hello 0123456789"), STR("hello {:.000010}"), STR("0123456789abcdef")); - check_exception( - "A format-spec arg-id replacement shouldn't have a negative value", - STR("hello {:.{}}"), world, -1); - check_exception( - "A format-spec arg-id replacement exceeds the maximum supported value", - STR("hello {:.{}}"), world, ~0u); + check_exception("A format-spec arg-id replacement shouldn't have a negative value", STR("hello {:.{}}"), world, -1); + check_exception("A format-spec arg-id replacement exceeds the maximum supported value", STR("hello {:.{}}"), world, + ~0u); check_exception("Argument index out of bounds", STR("hello {:.{}}"), world); - check_exception( - "A format-spec arg-id replacement argument isn't an integral type", - STR("hello {:.{}}"), world, universe); - check_exception( - "Using manual argument numbering in automatic argument numbering mode", - STR("hello {:.{0}}"), world, 1); - check_exception( - "Using automatic argument numbering in manual argument numbering mode", - STR("hello {0:.{}}"), world, 1); + check_exception("A format-spec arg-id replacement argument isn't an integral type", STR("hello {:.{}}"), world, + universe); + check_exception("Using manual argument numbering in automatic argument numbering mode", STR("hello {:.{0}}"), world, + 1); + check_exception("Using automatic argument numbering in manual argument numbering mode", STR("hello {0:.{}}"), world, + 1); // Arg-id may not have leading zeros. 
check_exception("Invalid arg-id", STR("hello {0:.{01}}"), world, 1); // *** locale-specific form *** - check_exception("The format-spec should consume the input or end with a '}'", - STR("hello {:L}"), world); + check_exception("The format-spec should consume the input or end with a '}'", STR("hello {:L}"), world); // *** type *** for (const auto& fmt : invalid_types("s")) - check_exception( - "The format-spec type has a type not supported for a string argument", - fmt, world); + check_exception("The format-spec type has a type not supported for a string argument", fmt, world); } template @@ -364,13 +333,10 @@ // Testing the char const[] is a bit tricky due to array to pointer decay. // Since there are separate tests in format.formatter.spec the array is not // tested here. - format_test_string(world.c_str(), universe.c_str(), check, + format_test_string(world.c_str(), universe.c_str(), check, check_exception); + format_test_string(const_cast(world.c_str()), const_cast(universe.c_str()), check, check_exception); - format_test_string(const_cast(world.c_str()), - const_cast(universe.c_str()), check, - check_exception); - format_test_string(std::basic_string_view(world), - std::basic_string_view(universe), check, + format_test_string(std::basic_string_view(world), std::basic_string_view(universe), check, check_exception); format_test_string(world, universe, check, check_exception); format_test_string_unicode(check); @@ -399,60 +365,41 @@ check(STR("answer is '-false--'"), STR("answer is '{:-^8s}'"), false); // *** Sign *** - check_exception("A sign field isn't allowed in this format-spec", STR("{:-}"), - true); - check_exception("A sign field isn't allowed in this format-spec", STR("{:+}"), - true); - check_exception("A sign field isn't allowed in this format-spec", STR("{: }"), - true); - - check_exception("A sign field isn't allowed in this format-spec", - STR("{:-s}"), true); - check_exception("A sign field isn't allowed in this format-spec", - STR("{:+s}"), true); 
- check_exception("A sign field isn't allowed in this format-spec", - STR("{: s}"), true); + check_exception("A sign field isn't allowed in this format-spec", STR("{:-}"), true); + check_exception("A sign field isn't allowed in this format-spec", STR("{:+}"), true); + check_exception("A sign field isn't allowed in this format-spec", STR("{: }"), true); + + check_exception("A sign field isn't allowed in this format-spec", STR("{:-s}"), true); + check_exception("A sign field isn't allowed in this format-spec", STR("{:+s}"), true); + check_exception("A sign field isn't allowed in this format-spec", STR("{: s}"), true); // *** alternate form *** - check_exception("An alternate form field isn't allowed in this format-spec", - STR("{:#}"), true); - check_exception("An alternate form field isn't allowed in this format-spec", - STR("{:#s}"), true); + check_exception("An alternate form field isn't allowed in this format-spec", STR("{:#}"), true); + check_exception("An alternate form field isn't allowed in this format-spec", STR("{:#s}"), true); // *** zero-padding *** - check_exception("A zero-padding field isn't allowed in this format-spec", - STR("{:0}"), true); - check_exception("A zero-padding field isn't allowed in this format-spec", - STR("{:0s}"), true); + check_exception("A zero-padding field isn't allowed in this format-spec", STR("{:0}"), true); + check_exception("A zero-padding field isn't allowed in this format-spec", STR("{:0s}"), true); // *** precision *** - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.}"), true); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.0}"), true); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.42}"), true); - - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.s}"), true); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.0s}"), 
true); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.42s}"), true); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.}"), true); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.0}"), true); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.42}"), true); + + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.s}"), true); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.0s}"), true); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.42s}"), true); // *** locale-specific form *** // See locale-specific_form.pass.cpp // *** type *** for (const auto& fmt : invalid_types("bBcdosxX")) - check_exception( - "The format-spec type has a type not supported for a bool argument", - fmt, true); + check_exception("The format-spec type has a type not supported for a bool argument", fmt, true); } template -void format_test_bool_as_char(TestFunction check, - ExceptionTest check_exception) { +void format_test_bool_as_char(TestFunction check, ExceptionTest check_exception) { // *** align-fill & width *** check(STR("answer is '\1 '"), STR("answer is '{:6c}'"), true); check(STR("answer is ' \1'"), STR("answer is '{:>6c}'"), true); @@ -463,47 +410,31 @@ check(STR("answer is '\1-----'"), STR("answer is '{:-<6c}'"), true); check(STR("answer is '--\1---'"), STR("answer is '{:-^6c}'"), true); - check(std::basic_string(CSTR("answer is '\0 '"), 18), - STR("answer is '{:6c}'"), false); - check(std::basic_string(CSTR("answer is '\0 '"), 18), - STR("answer is '{:6c}'"), false); - check(std::basic_string(CSTR("answer is ' \0'"), 18), - STR("answer is '{:>6c}'"), false); - check(std::basic_string(CSTR("answer is '\0 '"), 18), - STR("answer is '{:<6c}'"), false); - check(std::basic_string(CSTR("answer is ' \0 '"), 18), - STR("answer 
is '{:^6c}'"), false); - - check(std::basic_string(CSTR("answer is '-----\0'"), 18), - STR("answer is '{:->6c}'"), false); - check(std::basic_string(CSTR("answer is '\0-----'"), 18), - STR("answer is '{:-<6c}'"), false); - check(std::basic_string(CSTR("answer is '--\0---'"), 18), - STR("answer is '{:-^6c}'"), false); + check(std::basic_string(CSTR("answer is '\0 '"), 18), STR("answer is '{:6c}'"), false); + check(std::basic_string(CSTR("answer is '\0 '"), 18), STR("answer is '{:6c}'"), false); + check(std::basic_string(CSTR("answer is ' \0'"), 18), STR("answer is '{:>6c}'"), false); + check(std::basic_string(CSTR("answer is '\0 '"), 18), STR("answer is '{:<6c}'"), false); + check(std::basic_string(CSTR("answer is ' \0 '"), 18), STR("answer is '{:^6c}'"), false); + + check(std::basic_string(CSTR("answer is '-----\0'"), 18), STR("answer is '{:->6c}'"), false); + check(std::basic_string(CSTR("answer is '\0-----'"), 18), STR("answer is '{:-<6c}'"), false); + check(std::basic_string(CSTR("answer is '--\0---'"), 18), STR("answer is '{:-^6c}'"), false); // *** Sign *** - check_exception("A sign field isn't allowed in this format-spec", - STR("{:-c}"), true); - check_exception("A sign field isn't allowed in this format-spec", - STR("{:+c}"), true); - check_exception("A sign field isn't allowed in this format-spec", - STR("{: c}"), true); + check_exception("A sign field isn't allowed in this format-spec", STR("{:-c}"), true); + check_exception("A sign field isn't allowed in this format-spec", STR("{:+c}"), true); + check_exception("A sign field isn't allowed in this format-spec", STR("{: c}"), true); // *** alternate form *** - check_exception("An alternate form field isn't allowed in this format-spec", - STR("{:#c}"), true); + check_exception("An alternate form field isn't allowed in this format-spec", STR("{:#c}"), true); // *** zero-padding *** - check_exception("A zero-padding field isn't allowed in this format-spec", - STR("{:0c}"), true); + check_exception("A 
zero-padding field isn't allowed in this format-spec", STR("{:0c}"), true); // *** precision *** - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.c}"), true); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.0c}"), true); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.42c}"), true); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.c}"), true); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.0c}"), true); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.42c}"), true); // *** locale-specific form *** // Note it has no effect but it's allowed. @@ -511,14 +442,11 @@ // *** type *** for (const auto& fmt : invalid_types("bBcdosxX")) - check_exception( - "The format-spec type has a type not supported for a bool argument", - fmt, true); + check_exception("The format-spec type has a type not supported for a bool argument", fmt, true); } template -void format_test_bool_as_integer(TestFunction check, - ExceptionTest check_exception) { +void format_test_bool_as_integer(TestFunction check, ExceptionTest check_exception) { // *** align-fill & width *** check(STR("answer is '1'"), STR("answer is '{:<1d}'"), true); check(STR("answer is '1 '"), STR("answer is '{:<2d}'"), true); @@ -591,26 +519,20 @@ check(STR("answer is 0X0000000000"), STR("answer is {:#012X}"), false); // *** precision *** - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.}"), true); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.0}"), true); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.42}"), true); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.}"), true); + check_exception("The format-spec should consume 
the input or end with a '}'", STR("{:.0}"), true); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.42}"), true); // *** locale-specific form *** // See locale-specific_form.pass.cpp // *** type *** for (const auto& fmt : invalid_types("bBcdosxX")) - check_exception( - "The format-spec type has a type not supported for a bool argument", - fmt, true); + check_exception("The format-spec type has a type not supported for a bool argument", fmt, true); } template -void format_test_integer_as_integer(TestFunction check, - ExceptionTest check_exception) { +void format_test_integer_as_integer(TestFunction check, ExceptionTest check_exception) { // *** align-fill & width *** check(STR("answer is '42'"), STR("answer is '{:<1}'"), I(42)); check(STR("answer is '42'"), STR("answer is '{:<2}'"), I(42)); @@ -729,26 +651,20 @@ check(STR("answer is +0X00000002A"), STR("answer is {:+#012X}"), I(42)); // *** precision *** - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.}"), I(0)); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.0}"), I(0)); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.42}"), I(0)); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.}"), I(0)); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.0}"), I(0)); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.42}"), I(0)); // *** locale-specific form *** // See locale-specific_form.pass.cpp // *** type *** for (const auto& fmt : invalid_types("bBcdoxX")) - check_exception( - "The format-spec type has a type not supported for an integer argument", - fmt, 42); + check_exception("The format-spec type has a type not supported for an integer argument", fmt, 42); } template -void format_test_integer_as_char(TestFunction check, - ExceptionTest 
check_exception) { +void format_test_integer_as_char(TestFunction check, ExceptionTest check_exception) { // *** align-fill & width *** check(STR("answer is '* '"), STR("answer is '{:6c}'"), I(42)); check(STR("answer is ' *'"), STR("answer is '{:>6c}'"), I(42)); @@ -761,28 +677,20 @@ // *** Sign *** check(STR("answer is *"), STR("answer is {:c}"), I(42)); - check_exception("A sign field isn't allowed in this format-spec", - STR("answer is {:-c}"), I(42)); - check_exception("A sign field isn't allowed in this format-spec", - STR("answer is {:+c}"), I(42)); - check_exception("A sign field isn't allowed in this format-spec", - STR("answer is {: c}"), I(42)); + check_exception("A sign field isn't allowed in this format-spec", STR("answer is {:-c}"), I(42)); + check_exception("A sign field isn't allowed in this format-spec", STR("answer is {:+c}"), I(42)); + check_exception("A sign field isn't allowed in this format-spec", STR("answer is {: c}"), I(42)); // *** alternate form *** - check_exception("An alternate form field isn't allowed in this format-spec", - STR("answer is {:#c}"), I(42)); + check_exception("An alternate form field isn't allowed in this format-spec", STR("answer is {:#c}"), I(42)); // *** zero-padding & width *** - check_exception("A zero-padding field isn't allowed in this format-spec", - STR("answer is {:01c}"), I(42)); + check_exception("A zero-padding field isn't allowed in this format-spec", STR("answer is {:01c}"), I(42)); // *** precision *** - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.c}"), I(0)); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.0c}"), I(0)); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.42c}"), I(0)); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.c}"), I(0)); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.0c}"), 
I(0)); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.42c}"), I(0)); // *** locale-specific form *** // Note it has no effect but it's allowed. @@ -790,9 +698,7 @@ // *** type *** for (const auto& fmt : invalid_types("bBcdoxX")) - check_exception( - "The format-spec type has a type not supported for an integer argument", - fmt, I(42)); + check_exception("The format-spec type has a type not supported for an integer argument", fmt, I(42)); // *** Validate range *** // TODO FMT Update test after adding 128-bit support. @@ -800,18 +706,16 @@ // The code has some duplications to keep the if statement readable. if constexpr (std::signed_integral) { if constexpr (std::signed_integral && sizeof(I) > sizeof(CharT)) { - check_exception("Integral value outside the range of the char type", - STR("{:c}"), std::numeric_limits::min()); - check_exception("Integral value outside the range of the char type", - STR("{:c}"), std::numeric_limits::max()); - } else if constexpr (std::unsigned_integral && - sizeof(I) >= sizeof(CharT)) { - check_exception("Integral value outside the range of the char type", - STR("{:c}"), std::numeric_limits::max()); + check_exception("Integral value outside the range of the char type", STR("{:c}"), + std::numeric_limits::min()); + check_exception("Integral value outside the range of the char type", STR("{:c}"), + std::numeric_limits::max()); + } else if constexpr (std::unsigned_integral && sizeof(I) >= sizeof(CharT)) { + check_exception("Integral value outside the range of the char type", STR("{:c}"), + std::numeric_limits::max()); } } else if constexpr (sizeof(I) > sizeof(CharT)) { - check_exception("Integral value outside the range of the char type", - STR("{:c}"), std::numeric_limits::max()); + check_exception("Integral value outside the range of the char type", STR("{:c}"), std::numeric_limits::max()); } } } @@ -823,8 +727,7 @@ } template -void format_test_signed_integer(TestFunction check, - ExceptionTest 
check_exception) { +void format_test_signed_integer(TestFunction check, ExceptionTest check_exception) { format_test_integer(check, check_exception); format_test_integer(check, check_exception); format_test_integer(check, check_exception); @@ -839,62 +742,49 @@ check(STR("-128"), STR("{:#}"), std::numeric_limits::min()); check(STR("-0x80"), STR("{:#x}"), std::numeric_limits::min()); - check(STR("-0b1000000000000000"), STR("{:#b}"), - std::numeric_limits::min()); + check(STR("-0b1000000000000000"), STR("{:#b}"), std::numeric_limits::min()); check(STR("-0100000"), STR("{:#o}"), std::numeric_limits::min()); check(STR("-32768"), STR("{:#}"), std::numeric_limits::min()); check(STR("-0x8000"), STR("{:#x}"), std::numeric_limits::min()); - check(STR("-0b10000000000000000000000000000000"), STR("{:#b}"), - std::numeric_limits::min()); - check(STR("-020000000000"), STR("{:#o}"), - std::numeric_limits::min()); + check(STR("-0b10000000000000000000000000000000"), STR("{:#b}"), std::numeric_limits::min()); + check(STR("-020000000000"), STR("{:#o}"), std::numeric_limits::min()); check(STR("-2147483648"), STR("{:#}"), std::numeric_limits::min()); check(STR("-0x80000000"), STR("{:#x}"), std::numeric_limits::min()); check(STR("-0b100000000000000000000000000000000000000000000000000000000000000" "0"), STR("{:#b}"), std::numeric_limits::min()); - check(STR("-01000000000000000000000"), STR("{:#o}"), - std::numeric_limits::min()); - check(STR("-9223372036854775808"), STR("{:#}"), - std::numeric_limits::min()); - check(STR("-0x8000000000000000"), STR("{:#x}"), - std::numeric_limits::min()); + check(STR("-01000000000000000000000"), STR("{:#o}"), std::numeric_limits::min()); + check(STR("-9223372036854775808"), STR("{:#}"), std::numeric_limits::min()); + check(STR("-0x8000000000000000"), STR("{:#x}"), std::numeric_limits::min()); check(STR("0b1111111"), STR("{:#b}"), std::numeric_limits::max()); check(STR("0177"), STR("{:#o}"), std::numeric_limits::max()); check(STR("127"), STR("{:#}"), 
std::numeric_limits::max()); check(STR("0x7f"), STR("{:#x}"), std::numeric_limits::max()); - check(STR("0b111111111111111"), STR("{:#b}"), - std::numeric_limits::max()); + check(STR("0b111111111111111"), STR("{:#b}"), std::numeric_limits::max()); check(STR("077777"), STR("{:#o}"), std::numeric_limits::max()); check(STR("32767"), STR("{:#}"), std::numeric_limits::max()); check(STR("0x7fff"), STR("{:#x}"), std::numeric_limits::max()); - check(STR("0b1111111111111111111111111111111"), STR("{:#b}"), - std::numeric_limits::max()); + check(STR("0b1111111111111111111111111111111"), STR("{:#b}"), std::numeric_limits::max()); check(STR("017777777777"), STR("{:#o}"), std::numeric_limits::max()); check(STR("2147483647"), STR("{:#}"), std::numeric_limits::max()); check(STR("0x7fffffff"), STR("{:#x}"), std::numeric_limits::max()); - check( - STR("0b111111111111111111111111111111111111111111111111111111111111111"), - STR("{:#b}"), std::numeric_limits::max()); - check(STR("0777777777777777777777"), STR("{:#o}"), - std::numeric_limits::max()); - check(STR("9223372036854775807"), STR("{:#}"), - std::numeric_limits::max()); - check(STR("0x7fffffffffffffff"), STR("{:#x}"), + check(STR("0b111111111111111111111111111111111111111111111111111111111111111"), STR("{:#b}"), std::numeric_limits::max()); + check(STR("0777777777777777777777"), STR("{:#o}"), std::numeric_limits::max()); + check(STR("9223372036854775807"), STR("{:#}"), std::numeric_limits::max()); + check(STR("0x7fffffffffffffff"), STR("{:#x}"), std::numeric_limits::max()); // TODO FMT Add __int128_t test after implementing full range. 
} template -void format_test_unsigned_integer(TestFunction check, - ExceptionTest check_exception) { +void format_test_unsigned_integer(TestFunction check, ExceptionTest check_exception) { format_test_integer(check, check_exception); format_test_integer(check, check_exception); format_test_integer(check, check_exception); @@ -909,28 +799,21 @@ check(STR("255"), STR("{:#}"), std::numeric_limits::max()); check(STR("0xff"), STR("{:#x}"), std::numeric_limits::max()); - check(STR("0b1111111111111111"), STR("{:#b}"), - std::numeric_limits::max()); + check(STR("0b1111111111111111"), STR("{:#b}"), std::numeric_limits::max()); check(STR("0177777"), STR("{:#o}"), std::numeric_limits::max()); check(STR("65535"), STR("{:#}"), std::numeric_limits::max()); check(STR("0xffff"), STR("{:#x}"), std::numeric_limits::max()); - check(STR("0b11111111111111111111111111111111"), STR("{:#b}"), - std::numeric_limits::max()); - check(STR("037777777777"), STR("{:#o}"), - std::numeric_limits::max()); + check(STR("0b11111111111111111111111111111111"), STR("{:#b}"), std::numeric_limits::max()); + check(STR("037777777777"), STR("{:#o}"), std::numeric_limits::max()); check(STR("4294967295"), STR("{:#}"), std::numeric_limits::max()); check(STR("0xffffffff"), STR("{:#x}"), std::numeric_limits::max()); - check( - STR("0b1111111111111111111111111111111111111111111111111111111111111111"), - STR("{:#b}"), std::numeric_limits::max()); - check(STR("01777777777777777777777"), STR("{:#o}"), - std::numeric_limits::max()); - check(STR("18446744073709551615"), STR("{:#}"), - std::numeric_limits::max()); - check(STR("0xffffffffffffffff"), STR("{:#x}"), + check(STR("0b1111111111111111111111111111111111111111111111111111111111111111"), STR("{:#b}"), std::numeric_limits::max()); + check(STR("01777777777777777777777"), STR("{:#o}"), std::numeric_limits::max()); + check(STR("18446744073709551615"), STR("{:#}"), std::numeric_limits::max()); + check(STR("0xffffffffffffffff"), STR("{:#x}"), std::numeric_limits::max()); 
// TODO FMT Add __uint128_t test after implementing full range. } @@ -959,46 +842,30 @@ check(STR("answer is '--*---'"), STR("answer is '{:-^6c}'"), CharT('*')); // *** Sign *** - check_exception("A sign field isn't allowed in this format-spec", STR("{:-}"), - CharT('*')); - check_exception("A sign field isn't allowed in this format-spec", STR("{:+}"), - CharT('*')); - check_exception("A sign field isn't allowed in this format-spec", STR("{: }"), - CharT('*')); - - check_exception("A sign field isn't allowed in this format-spec", - STR("{:-c}"), CharT('*')); - check_exception("A sign field isn't allowed in this format-spec", - STR("{:+c}"), CharT('*')); - check_exception("A sign field isn't allowed in this format-spec", - STR("{: c}"), CharT('*')); + check_exception("A sign field isn't allowed in this format-spec", STR("{:-}"), CharT('*')); + check_exception("A sign field isn't allowed in this format-spec", STR("{:+}"), CharT('*')); + check_exception("A sign field isn't allowed in this format-spec", STR("{: }"), CharT('*')); + + check_exception("A sign field isn't allowed in this format-spec", STR("{:-c}"), CharT('*')); + check_exception("A sign field isn't allowed in this format-spec", STR("{:+c}"), CharT('*')); + check_exception("A sign field isn't allowed in this format-spec", STR("{: c}"), CharT('*')); // *** alternate form *** - check_exception("An alternate form field isn't allowed in this format-spec", - STR("{:#}"), CharT('*')); - check_exception("An alternate form field isn't allowed in this format-spec", - STR("{:#c}"), CharT('*')); + check_exception("An alternate form field isn't allowed in this format-spec", STR("{:#}"), CharT('*')); + check_exception("An alternate form field isn't allowed in this format-spec", STR("{:#c}"), CharT('*')); // *** zero-padding *** - check_exception("A zero-padding field isn't allowed in this format-spec", - STR("{:0}"), CharT('*')); - check_exception("A zero-padding field isn't allowed in this format-spec", - STR("{:0c}"), 
CharT('*')); + check_exception("A zero-padding field isn't allowed in this format-spec", STR("{:0}"), CharT('*')); + check_exception("A zero-padding field isn't allowed in this format-spec", STR("{:0c}"), CharT('*')); // *** precision *** - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.}"), CharT('*')); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.0}"), CharT('*')); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.42}"), CharT('*')); - - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.c}"), CharT('*')); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.0c}"), CharT('*')); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.42c}"), CharT('*')); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.}"), CharT('*')); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.0}"), CharT('*')); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.42}"), CharT('*')); + + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.c}"), CharT('*')); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.0c}"), CharT('*')); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.42c}"), CharT('*')); // *** locale-specific form *** // Note it has no effect but it's allowed. 
@@ -1007,14 +874,11 @@ // *** type *** for (const auto& fmt : invalid_types("bBcdoxX")) - check_exception( - "The format-spec type has a type not supported for a char argument", - fmt, CharT('*')); + check_exception("The format-spec type has a type not supported for a char argument", fmt, CharT('*')); } template -void format_test_char_as_integer(TestFunction check, - ExceptionTest check_exception) { +void format_test_char_as_integer(TestFunction check, ExceptionTest check_exception) { // *** align-fill & width *** check(STR("answer is '42'"), STR("answer is '{:<1d}'"), CharT('*')); @@ -1067,21 +931,16 @@ check(STR("answer is +0X00000002A"), STR("answer is {:+#012X}"), CharT('*')); // *** precision *** - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.d}"), CharT('*')); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.0d}"), CharT('*')); - check_exception("The format-spec should consume the input or end with a '}'", - STR("{:.42d}"), CharT('*')); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.d}"), CharT('*')); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.0d}"), CharT('*')); + check_exception("The format-spec should consume the input or end with a '}'", STR("{:.42d}"), CharT('*')); // *** locale-specific form *** // See locale-specific_form.pass.cpp // *** type *** for (const auto& fmt : invalid_types("bBcdoxX")) - check_exception( - "The format-spec type has a type not supported for a char argument", - fmt, '*'); + check_exception("The format-spec type has a type not supported for a char argument", fmt, '*'); } template @@ -2650,42 +2509,34 @@ // ** Test invalid format strings *** check_exception("The format string terminates at a '{'", STR("{")); - check_exception("The replacement field misses a terminating '}'", STR("{:"), - 42); + check_exception("The replacement field misses a terminating '}'", 
STR("{:"), 42); - check_exception("The format string contains an invalid escape sequence", - STR("}")); - check_exception("The format string contains an invalid escape sequence", - STR("{:}-}"), 42); + check_exception("The format string contains an invalid escape sequence", STR("}")); + check_exception("The format string contains an invalid escape sequence", STR("{:}-}"), 42); - check_exception("The format string contains an invalid escape sequence", - STR("} ")); + check_exception("The format string contains an invalid escape sequence", STR("} ")); - check_exception( - "The arg-id of the format-spec starts with an invalid character", - STR("{-"), 42); + check_exception("The arg-id of the format-spec starts with an invalid character", STR("{-"), 42); check_exception("Argument index out of bounds", STR("hello {}")); check_exception("Argument index out of bounds", STR("hello {0}")); check_exception("Argument index out of bounds", STR("hello {1}"), 42); // *** Test char format argument *** // The `char` to `wchar_t` formatting is tested separately. 
- check(STR("hello 09azAZ!"), STR("hello {}{}{}{}{}{}{}"), CharT('0'), - CharT('9'), CharT('a'), CharT('z'), CharT('A'), CharT('Z'), CharT('!')); + check(STR("hello 09azAZ!"), STR("hello {}{}{}{}{}{}{}"), CharT('0'), CharT('9'), CharT('a'), CharT('z'), CharT('A'), + CharT('Z'), CharT('!')); format_test_char(check, check_exception); format_test_char_as_integer(check, check_exception); // *** Test string format argument *** { - CharT buffer[] = {CharT('0'), CharT('9'), CharT('a'), CharT('z'), - CharT('A'), CharT('Z'), CharT('!'), 0}; + CharT buffer[] = {CharT('0'), CharT('9'), CharT('a'), CharT('z'), CharT('A'), CharT('Z'), CharT('!'), 0}; CharT* data = buffer; check(STR("hello 09azAZ!"), STR("hello {}"), data); } { - CharT buffer[] = {CharT('0'), CharT('9'), CharT('a'), CharT('z'), - CharT('A'), CharT('Z'), CharT('!'), 0}; + CharT buffer[] = {CharT('0'), CharT('9'), CharT('a'), CharT('z'), CharT('A'), CharT('Z'), CharT('!'), 0}; const CharT* data = buffer; check(STR("hello 09azAZ!"), STR("hello {}"), data); } @@ -2718,20 +2569,14 @@ { // Note 128-bit support is only partly implemented test the range // conditions here. 
- std::basic_string min = - std::format(STR("{}"), std::numeric_limits::min()); - check(min, STR("{}"), - static_cast<__int128_t>(std::numeric_limits::min())); - std::basic_string max = - std::format(STR("{}"), std::numeric_limits::max()); - check(max, STR("{}"), - static_cast<__int128_t>(std::numeric_limits::max())); - check_exception( - "128-bit value is outside of implemented range", STR("{}"), - static_cast<__int128_t>(std::numeric_limits::min()) - 1); - check_exception( - "128-bit value is outside of implemented range", STR("{}"), - static_cast<__int128_t>(std::numeric_limits::max()) + 1); + std::basic_string min = std::format(STR("{}"), std::numeric_limits::min()); + check(min, STR("{}"), static_cast<__int128_t>(std::numeric_limits::min())); + std::basic_string max = std::format(STR("{}"), std::numeric_limits::max()); + check(max, STR("{}"), static_cast<__int128_t>(std::numeric_limits::max())); + check_exception("128-bit value is outside of implemented range", STR("{}"), + static_cast<__int128_t>(std::numeric_limits::min()) - 1); + check_exception("128-bit value is outside of implemented range", STR("{}"), + static_cast<__int128_t>(std::numeric_limits::max()) + 1); } #endif format_test_signed_integer(check, check_exception); @@ -2747,15 +2592,10 @@ { // Note 128-bit support is only partly implemented test the range // conditions here. 
- std::basic_string max = - std::format(STR("{}"), std::numeric_limits::max()); - check(max, STR("{}"), - static_cast<__uint128_t>( - std::numeric_limits::max())); + std::basic_string max = std::format(STR("{}"), std::numeric_limits::max()); + check(max, STR("{}"), static_cast<__uint128_t>(std::numeric_limits::max())); check_exception("128-bit value is outside of implemented range", STR("{}"), - static_cast<__uint128_t>( - std::numeric_limits::max()) + - 1); + static_cast<__uint128_t>(std::numeric_limits::max()) + 1); } #endif format_test_unsigned_integer(check, check_exception); diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp @@ -30,8 +30,7 @@ #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... args) { { std::basic_string out(expected.size(), CharT(' ')); @@ -42,14 +41,12 @@ { std::list out; std::format_to(std::back_inserter(out), std::locale(), fmt, args...); - assert( - std::equal(out.begin(), out.end(), expected.begin(), expected.end())); + assert(std::equal(out.begin(), out.end(), expected.begin(), expected.end())); } { std::vector out; std::format_to(std::back_inserter(out), std::locale(), fmt, args...); - assert( - std::equal(out.begin(), out.end(), expected.begin(), expected.end())); + assert(std::equal(out.begin(), out.end(), expected.begin(), expected.end())); } { assert(expected.size() < 4096 && "Update the size of the buffer."); @@ -61,8 +58,8 @@ } }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... 
args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { std::basic_string out; diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp @@ -31,8 +31,7 @@ #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... args) { { std::basic_string out(expected.size(), CharT(' ')); @@ -43,14 +42,12 @@ { std::list out; std::format_to(std::back_inserter(out), fmt, args...); - assert( - std::equal(out.begin(), out.end(), expected.begin(), expected.end())); + assert(std::equal(out.begin(), out.end(), expected.begin(), expected.end())); } { std::vector out; std::format_to(std::back_inserter(out), fmt, args...); - assert( - std::equal(out.begin(), out.end(), expected.begin(), expected.end())); + assert(std::equal(out.begin(), out.end(), expected.begin(), expected.end())); } { assert(expected.size() < 4096 && "Update the size of the buffer."); @@ -62,8 +59,8 @@ } }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... 
args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { std::basic_string out; diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp @@ -32,13 +32,11 @@ #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... args) { { std::list out; - std::format_to_n_result result = std::format_to_n( - std::back_inserter(out), 0, std::locale(), fmt, args...); + std::format_to_n_result result = std::format_to_n(std::back_inserter(out), 0, std::locale(), fmt, args...); // To avoid signedness warnings make sure formatted_size uses the same type // as result.size. using diff_type = decltype(result.size); @@ -49,20 +47,17 @@ } { std::vector out; - std::format_to_n_result result = std::format_to_n( - std::back_inserter(out), 5, std::locale(), fmt, args...); + std::format_to_n_result result = std::format_to_n(std::back_inserter(out), 5, std::locale(), fmt, args...); using diff_type = decltype(result.size); diff_type formatted_size = std::formatted_size(std::locale(), fmt, args...); diff_type size = std::min(5, formatted_size); assert(result.size == formatted_size); - assert(std::equal(out.begin(), out.end(), expected.begin(), - expected.begin() + size)); + assert(std::equal(out.begin(), out.end(), expected.begin(), expected.begin() + size)); } { std::basic_string out; - std::format_to_n_result result = std::format_to_n( - std::back_inserter(out), 1000, std::locale(), fmt, args...); + std::format_to_n_result result = std::format_to_n(std::back_inserter(out), 1000, std::locale(), fmt, args...); using diff_type = decltype(result.size); diff_type formatted_size = 
std::formatted_size(std::locale(), fmt, args...); diff_type size = std::min(1000, formatted_size); @@ -73,8 +68,7 @@ { // Test the returned iterator. std::basic_string out(10, CharT(' ')); - std::format_to_n_result result = - std::format_to_n(out.begin(), 10, std::locale(), fmt, args...); + std::format_to_n_result result = std::format_to_n(out.begin(), 10, std::locale(), fmt, args...); using diff_type = decltype(result.size); diff_type formatted_size = std::formatted_size(std::locale(), fmt, args...); diff_type size = std::min(10, formatted_size); @@ -88,8 +82,7 @@ "If the difference type isn't negative the test will fail " "due to using a large positive value."); CharT buffer[1] = {CharT(0)}; - std::format_to_n_result result = - std::format_to_n(buffer, -1, std::locale(), fmt, args...); + std::format_to_n_result result = std::format_to_n(buffer, -1, std::locale(), fmt, args...); using diff_type = decltype(result.size); diff_type formatted_size = std::formatted_size(std::locale(), fmt, args...); @@ -99,8 +92,8 @@ } }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { std::basic_string out; diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp @@ -29,13 +29,11 @@ #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... 
args) { { std::list out; - std::format_to_n_result result = - std::format_to_n(std::back_inserter(out), 0, fmt, args...); + std::format_to_n_result result = std::format_to_n(std::back_inserter(out), 0, fmt, args...); // To avoid signedness warnings make sure formatted_size uses the same type // as result.size. using diff_type = decltype(result.size); @@ -46,20 +44,17 @@ } { std::vector out; - std::format_to_n_result result = - std::format_to_n(std::back_inserter(out), 5, fmt, args...); + std::format_to_n_result result = std::format_to_n(std::back_inserter(out), 5, fmt, args...); using diff_type = decltype(result.size); diff_type formatted_size = std::formatted_size(fmt, args...); diff_type size = std::min(5, formatted_size); assert(result.size == formatted_size); - assert(std::equal(out.begin(), out.end(), expected.begin(), - expected.begin() + size)); + assert(std::equal(out.begin(), out.end(), expected.begin(), expected.begin() + size)); } { std::basic_string out; - std::format_to_n_result result = - std::format_to_n(std::back_inserter(out), 1000, fmt, args...); + std::format_to_n_result result = std::format_to_n(std::back_inserter(out), 1000, fmt, args...); using diff_type = decltype(result.size); diff_type formatted_size = std::formatted_size(fmt, args...); diff_type size = std::min(1000, formatted_size); @@ -70,8 +65,7 @@ { // Test the returned iterator. std::basic_string out(10, CharT(' ')); - std::format_to_n_result result = - std::format_to_n(out.begin(), 10, fmt, args...); + std::format_to_n_result result = std::format_to_n(out.begin(), 10, fmt, args...); using diff_type = decltype(result.size); diff_type formatted_size = std::formatted_size(fmt, args...); diff_type size = std::min(10, formatted_size); @@ -95,8 +89,8 @@ } }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... 
args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { std::basic_string out; diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp @@ -28,15 +28,14 @@ #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... args) { size_t size = std::formatted_size(std::locale(), fmt, args...); assert(size == expected.size()); }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { std::formatted_size(std::locale(), fmt, args...); diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp @@ -25,15 +25,14 @@ #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... args) { size_t size = std::formatted_size(fmt, args...); assert(size == expected.size()); }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... 
args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { std::formatted_size(fmt, args...); diff --git a/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp b/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp @@ -126,21 +126,19 @@ #endif template -void test(std::basic_string expected, std::basic_string fmt, - const Args&... args) { +void test(std::basic_string expected, std::basic_string fmt, const Args&... args) { // *** format *** { std::basic_string out = std::format(fmt, args...); if constexpr (std::same_as) if (out != expected) - std::cerr << "\nFormat string " << fmt << "\nExpected output " - << expected << "\nActual output " << out << '\n'; + std::cerr << "\nFormat string " << fmt << "\nExpected output " << expected << "\nActual output " << out + << '\n'; assert(out == expected); } // *** vformat *** { - std::basic_string out = - std::vformat(fmt, std::make_format_args>(args...)); + std::basic_string out = std::vformat(fmt, std::make_format_args>(args...)); assert(out == expected); } // *** format_to *** @@ -153,16 +151,14 @@ // *** vformat_to *** { std::basic_string out(expected.size(), CharT(' ')); - auto it = std::vformat_to(out.begin(), fmt, - std::make_format_args>(args...)); + auto it = std::vformat_to(out.begin(), fmt, std::make_format_args>(args...)); assert(it == out.end()); assert(out == expected); } // *** format_to_n *** { std::basic_string out; - std::format_to_n_result result = - std::format_to_n(std::back_inserter(out), 1000, fmt, args...); + std::format_to_n_result result = std::format_to_n(std::back_inserter(out), 1000, fmt, args...); using diff_type = decltype(result.size); diff_type formatted_size = std::formatted_size(fmt, args...); diff_type size = std::min(1000, formatted_size); @@ -178,21 +174,19 @@ } template -void 
test(std::basic_string expected, std::locale loc, - std::basic_string fmt, const Args&... args) { +void test(std::basic_string expected, std::locale loc, std::basic_string fmt, const Args&... args) { // *** format *** { std::basic_string out = std::format(loc, fmt, args...); if constexpr (std::same_as) if (out != expected) - std::cerr << "\nFormat string " << fmt << "\nExpected output " - << expected << "\nActual output " << out << '\n'; + std::cerr << "\nFormat string " << fmt << "\nExpected output " << expected << "\nActual output " << out + << '\n'; assert(out == expected); } // *** vformat *** { - std::basic_string out = std::vformat( - loc, fmt, std::make_format_args>(args...)); + std::basic_string out = std::vformat(loc, fmt, std::make_format_args>(args...)); assert(out == expected); } // *** format_to *** @@ -205,16 +199,14 @@ // *** vformat_to *** { std::basic_string out(expected.size(), CharT(' ')); - auto it = std::vformat_to(out.begin(), loc, fmt, - std::make_format_args>(args...)); + auto it = std::vformat_to(out.begin(), loc, fmt, std::make_format_args>(args...)); assert(it == out.end()); assert(out == expected); } // *** format_to_n *** { std::basic_string out; - std::format_to_n_result result = - std::format_to_n(std::back_inserter(out), 1000, loc, fmt, args...); + std::format_to_n_result result = std::format_to_n(std::back_inserter(out), 1000, loc, fmt, args...); using diff_type = decltype(result.size); diff_type formatted_size = std::formatted_size(loc, fmt, args...); diff_type size = std::min(1000, formatted_size); @@ -239,13 +231,13 @@ string_type do_falsename() const override { return "ungültig"; } }; -#ifndef TEST_HAS_NO_WIDE_CHARACTERS +# ifndef TEST_HAS_NO_WIDE_CHARACTERS template <> struct numpunct_unicode : std::numpunct { string_type do_truename() const override { return L"gültig"; } string_type do_falsename() const override { return L"ungültig"; } }; -#endif +# endif #endif // TEST_HAS_NO_UNICODE template @@ -268,8 +260,7 @@ 
test(STR("false"), std::locale(LOCALE_en_US_UTF_8), STR("{:L}"), false); #ifndef TEST_HAS_NO_UNICODE - std::locale loc_unicode = - std::locale(std::locale(), new numpunct_unicode()); + std::locale loc_unicode = std::locale(std::locale(), new numpunct_unicode()); test(STR("gültig"), loc_unicode, STR("{:L}"), true); test(STR("ungültig"), loc_unicode, STR("{:L}"), false); diff --git a/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp @@ -24,20 +24,17 @@ #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... args) { - std::basic_string out = std::vformat( - std::locale(), fmt, std::make_format_args>(args...)); + std::basic_string out = std::vformat(std::locale(), fmt, std::make_format_args>(args...)); assert(out == expected); }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... 
args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { - (void) std::vformat(std::locale(), fmt, - std::make_format_args>(args...)); + (void)std::vformat(std::locale(), fmt, std::make_format_args>(args...)); assert(false); } catch ([[maybe_unused]] std::format_error& e) { LIBCPP_ASSERT(e.what() == what); diff --git a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp @@ -23,16 +23,14 @@ #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... args) { - std::basic_string out = - std::vformat(fmt, std::make_format_args>(args...)); + std::basic_string out = std::vformat(fmt, std::make_format_args>(args...)); assert(out == expected); }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { TEST_IGNORE_NODISCARD std::vformat(fmt, std::make_format_args>(args...)); diff --git a/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp @@ -30,49 +30,40 @@ #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... 
args) { { std::basic_string out(expected.size(), CharT(' ')); - auto it = std::vformat_to(out.begin(), std::locale(), fmt, - std::make_format_args>(args...)); + auto it = std::vformat_to(out.begin(), std::locale(), fmt, std::make_format_args>(args...)); assert(it == out.end()); assert(out == expected); } { std::list out; - std::vformat_to(std::back_inserter(out), std::locale(), fmt, - std::make_format_args>(args...)); - assert( - std::equal(out.begin(), out.end(), expected.begin(), expected.end())); + std::vformat_to(std::back_inserter(out), std::locale(), fmt, std::make_format_args>(args...)); + assert(std::equal(out.begin(), out.end(), expected.begin(), expected.end())); } { std::vector out; - std::vformat_to(std::back_inserter(out), std::locale(), fmt, - std::make_format_args>(args...)); - assert( - std::equal(out.begin(), out.end(), expected.begin(), expected.end())); + std::vformat_to(std::back_inserter(out), std::locale(), fmt, std::make_format_args>(args...)); + assert(std::equal(out.begin(), out.end(), expected.begin(), expected.end())); } { assert(expected.size() < 4096 && "Update the size of the buffer."); CharT out[4096]; - CharT* it = - std::vformat_to(out, std::locale(), fmt, - std::make_format_args>(args...)); + CharT* it = std::vformat_to(out, std::locale(), fmt, std::make_format_args>(args...)); assert(std::distance(out, it) == int(expected.size())); // Convert to std::string since output contains '\0' for boolean tests. assert(std::basic_string(out, it) == expected); } }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... 
args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { std::basic_string out; - std::vformat_to(std::back_inserter(out), std::locale(), fmt, - std::make_format_args>(args...)); + std::vformat_to(std::back_inserter(out), std::locale(), fmt, std::make_format_args>(args...)); assert(false); } catch ([[maybe_unused]] std::format_error& e) { LIBCPP_ASSERT(e.what() == what); diff --git a/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp --- a/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp @@ -31,48 +31,40 @@ #include "test_macros.h" #include "format_tests.h" -auto test = [](std::basic_string expected, - std::basic_string fmt, +auto test = [](std::basic_string expected, std::basic_string fmt, const Args&... args) { { std::basic_string out(expected.size(), CharT(' ')); - auto it = std::vformat_to(out.begin(), fmt, - std::make_format_args>(args...)); + auto it = std::vformat_to(out.begin(), fmt, std::make_format_args>(args...)); assert(it == out.end()); assert(out == expected); } { std::list out; - std::vformat_to(std::back_inserter(out), fmt, - std::make_format_args>(args...)); - assert( - std::equal(out.begin(), out.end(), expected.begin(), expected.end())); + std::vformat_to(std::back_inserter(out), fmt, std::make_format_args>(args...)); + assert(std::equal(out.begin(), out.end(), expected.begin(), expected.end())); } { std::vector out; - std::vformat_to(std::back_inserter(out), fmt, - std::make_format_args>(args...)); - assert( - std::equal(out.begin(), out.end(), expected.begin(), expected.end())); + std::vformat_to(std::back_inserter(out), fmt, std::make_format_args>(args...)); + assert(std::equal(out.begin(), out.end(), expected.begin(), expected.end())); } { assert(expected.size() < 4096 && "Update the size of the buffer."); CharT out[4096]; - CharT* it = std::vformat_to( - out, fmt, 
std::make_format_args>(args...)); + CharT* it = std::vformat_to(out, fmt, std::make_format_args>(args...)); assert(std::distance(out, it) == int(expected.size())); // Convert to std::string since output contains '\0' for boolean tests. assert(std::basic_string(out, it) == expected); } }; -auto test_exception = []( - std::string_view what, std::basic_string fmt, const Args&... args) { +auto test_exception = + [](std::string_view what, std::basic_string fmt, const Args&... args) { #ifndef TEST_HAS_NO_EXCEPTIONS try { std::basic_string out; - std::vformat_to(std::back_inserter(out), fmt, - std::make_format_args>(args...)); + std::vformat_to(std::back_inserter(out), fmt, std::make_format_args>(args...)); assert(false); } catch ([[maybe_unused]] std::format_error& e) { LIBCPP_ASSERT(e.what() == what); diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt --- a/lld/ELF/CMakeLists.txt +++ b/lld/ELF/CMakeLists.txt @@ -2,6 +2,10 @@ tablegen(LLVM Options.inc -gen-opt-parser-defs) add_public_tablegen_target(ELFOptionsTableGen) +if(LLVM_ENABLE_ZLIB) + set(imported_libs ZLIB::ZLIB) +endif() + add_lld_library(lldELF AArch64ErrataFix.cpp Arch/AArch64.cpp @@ -58,6 +62,7 @@ LINK_LIBS lldCommon + ${imported_libs} ${LLVM_PTHREAD_LIB} DEPENDS diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h --- a/lld/ELF/OutputSections.h +++ b/lld/ELF/OutputSections.h @@ -25,6 +25,12 @@ class InputSection; class InputSectionBase; +struct CompressedData { + std::unique_ptr[]> shards; + uint32_t numShards = 0; + uint32_t checksum = 0; +}; + // This represents a section in an output file. // It is composed of multiple InputSections. // The writer creates multiple OutputSections and assign them unique, @@ -113,7 +119,7 @@ private: // Used for implementation of --compress-debug-sections option. 
SmallVector zDebugHeader; - SmallVector compressedData; + CompressedData compressed; std::array getFiller(); }; diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -15,7 +15,7 @@ #include "lld/Common/Memory.h" #include "lld/Common/Strings.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/Support/Compression.h" +#include "llvm/Config/config.h" // LLVM_ENABLE_ZLIB #include "llvm/Support/MD5.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Parallel.h" @@ -23,6 +23,9 @@ #include "llvm/Support/TimeProfiler.h" #include #include +#if LLVM_ENABLE_ZLIB +#include +#endif using namespace llvm; using namespace llvm::dwarf; @@ -284,13 +287,45 @@ memcpy(buf + i, filler.data(), size - i); } +#if LLVM_ENABLE_ZLIB +static SmallVector deflateShard(ArrayRef in, int level, + int flush) { + // 15 and 8 are default. windowBits=-15 is negative to generate raw deflate + // data with no zlib header or trailer. + z_stream s = {}; + deflateInit2(&s, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); + s.next_in = const_cast(in.data()); + s.avail_in = in.size(); + + // Allocate a buffer of half of the input size, and grow it by 1.5x if + // insufficient. + SmallVector out; + size_t pos = 0; + out.resize_for_overwrite(std::max(in.size() / 2, 64)); + do { + if (pos == out.size()) + out.resize_for_overwrite(out.size() * 3 / 2); + s.next_out = out.data() + pos; + s.avail_out = out.size() - pos; + (void)deflate(&s, flush); + pos = s.next_out - out.data(); + } while (s.avail_out == 0); + assert(s.avail_in == 0); + + out.truncate(pos); + deflateEnd(&s); + return out; +} +#endif + // Compress section contents if this section contains debug info. template void OutputSection::maybeCompress() { +#if LLVM_ENABLE_ZLIB using Elf_Chdr = typename ELFT::Chdr; // Compress only DWARF debug sections. 
if (!config->compressDebugSections || (flags & SHF_ALLOC) || - !name.startswith(".debug_")) + !name.startswith(".debug_") || size == 0) return; llvm::TimeTraceScope timeScope("Compress debug sections"); @@ -309,13 +344,42 @@ // -O2 is given, we use level 6 to compress debug info more by ~15%. We found // that level 7 to 9 doesn't make much difference (~1% more compression) while // they take significant amount of time (~2x), so level 6 seems enough. - if (Error e = zlib::compress(toStringRef(buf), compressedData, - config->optimize >= 2 ? 6 : 1)) - fatal("compress failed: " + llvm::toString(std::move(e))); + const int level = config->optimize >= 2 ? 6 : Z_BEST_SPEED; + + // Split input into 1-MiB shards. + constexpr size_t shardSize = 1 << 20; + const size_t numShards = (size + shardSize - 1) / shardSize; + auto shardsIn = std::make_unique[]>(numShards); + for (size_t i = 0, start = 0, end; start != buf.size(); ++i, start = end) { + end = std::min(start + shardSize, buf.size()); + shardsIn[i] = makeArrayRef(buf.data() + start, end - start); + } + + // Compress shards and compute Alder-32 checksums. Use Z_SYNC_FLUSH for all + // shards but the last to flush the output to a byte boundary to be + // concatenated with the next shard. + auto shardsOut = std::make_unique[]>(numShards); + auto shardsAdler = std::make_unique(numShards); + parallelForEachN(0, numShards, [&](size_t i) { + shardsOut[i] = deflateShard(shardsIn[i], level, + i != numShards - 1 ? Z_SYNC_FLUSH : Z_FINISH); + shardsAdler[i] = adler32(1, shardsIn[i].data(), shardsIn[i].size()); + }); + + // Update section size and combine Alder-32 checksums. + uint32_t checksum = 1; // Initial Adler-32 value + size = sizeof(Elf_Chdr) + 2; // Elf_Chdir and zlib header + for (size_t i = 0; i != numShards; ++i) { + size += shardsOut[i].size(); + checksum = adler32_combine(checksum, shardsAdler[i], shardsIn[i].size()); + } + size += 4; // checksum - // Update section headers. 
- size = sizeof(Elf_Chdr) + compressedData.size(); + compressed.shards = std::move(shardsOut); + compressed.numShards = numShards; + compressed.checksum = checksum; flags |= SHF_COMPRESSED; +#endif } static void writeInt(uint8_t *buf, uint64_t data, uint64_t size) { @@ -339,10 +403,25 @@ // If --compress-debug-section is specified and if this is a debug section, // we've already compressed section contents. If that's the case, // just write it down. - if (!compressedData.empty()) { + if (compressed.shards) { memcpy(buf, zDebugHeader.data(), zDebugHeader.size()); - memcpy(buf + zDebugHeader.size(), compressedData.data(), - compressedData.size()); + buf += zDebugHeader.size(); + size -= zDebugHeader.size(); + + // Compute shard offsets. + auto offsets = std::make_unique(compressed.numShards); + offsets[0] = 2; // zlib header + for (size_t i = 1; i != compressed.numShards; ++i) + offsets[i] = offsets[i - 1] + compressed.shards[i - 1].size(); + + buf[0] = 0x78; // CMF + buf[1] = 0x01; // FLG: best speed + parallelForEachN(0, compressed.numShards, [&](size_t i) { + memcpy(buf + offsets[i], compressed.shards[i].data(), + compressed.shards[i].size()); + }); + + write32be(buf + size - 4, compressed.checksum); return; } diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/StringRef.h" #include "Plugins/Process/Utility/RegisterContextDarwin_arm.h" @@ -55,6 +56,9 @@ #include // GetLLDBSharedCacheUUID() needs to call dlsym() #include +#include +#include +#include #endif #ifndef __APPLE__ @@ -155,28 +159,6 @@ // and later }; -struct lldb_copy_dyld_cache_mapping_info { - uint64_t address; - uint64_t size; - uint64_t fileOffset; 
- uint32_t maxProt; - uint32_t initProt; -}; - -struct lldb_copy_dyld_cache_local_symbols_info { - uint32_t nlistOffset; - uint32_t nlistCount; - uint32_t stringsOffset; - uint32_t stringsSize; - uint32_t entriesOffset; - uint32_t entriesCount; -}; -struct lldb_copy_dyld_cache_local_symbols_entry { - uint32_t dylibOffset; - uint32_t nlistStartIndex; - uint32_t nlistCount; -}; - static void PrintRegisterValue(RegisterContext *reg_ctx, const char *name, const char *alt_name, size_t reg_byte_size, Stream &data) { @@ -2257,6 +2239,7 @@ llvm::StringRef g_objc_v2_prefix_class("_OBJC_CLASS_$_"); llvm::StringRef g_objc_v2_prefix_metaclass("_OBJC_METACLASS_$_"); llvm::StringRef g_objc_v2_prefix_ivar("_OBJC_IVAR_$_"); + UUID image_uuid; for (i = 0; i < m_header.ncmds; ++i) { const lldb::offset_t cmd_offset = offset; @@ -2324,6 +2307,14 @@ sizeof(function_starts_load_command)); break; + case LC_UUID: { + const uint8_t *uuid_bytes = m_data.PeekData(offset, 16); + + if (uuid_bytes) + image_uuid = UUID::fromOptionalData(uuid_bytes, 16); + break; + } + default: break; } @@ -2615,8 +2606,6 @@ ? eh_frame_section_sp->GetID() : static_cast(NO_SECT); - lldb::offset_t nlist_data_offset = 0; - uint32_t N_SO_index = UINT32_MAX; MachSymtabSectionInfo section_info(section_list); @@ -2682,26 +2671,6 @@ // Next we need to determine the correct path for the dyld shared cache. 
ArchSpec header_arch = GetArchitecture(); - char dsc_path[PATH_MAX]; - char dsc_path_development[PATH_MAX]; - - snprintf( - dsc_path, sizeof(dsc_path), "%s%s%s", - "/System/Library/Caches/com.apple.dyld/", /* IPHONE_DYLD_SHARED_CACHE_DIR - */ - "dyld_shared_cache_", /* DYLD_SHARED_CACHE_BASE_NAME */ - header_arch.GetArchitectureName()); - - snprintf( - dsc_path_development, sizeof(dsc_path), "%s%s%s%s", - "/System/Library/Caches/com.apple.dyld/", /* IPHONE_DYLD_SHARED_CACHE_DIR - */ - "dyld_shared_cache_", /* DYLD_SHARED_CACHE_BASE_NAME */ - header_arch.GetArchitectureName(), ".development"); - - FileSpec dsc_nondevelopment_filespec(dsc_path); - FileSpec dsc_development_filespec(dsc_path_development); - FileSpec dsc_filespec; UUID dsc_uuid; UUID process_shared_cache_uuid; @@ -2712,155 +2681,99 @@ process_shared_cache_uuid); } - // First see if we can find an exact match for the inferior process - // shared cache UUID in the development or non-development shared caches - // on disk. - if (process_shared_cache_uuid.IsValid()) { - if (FileSystem::Instance().Exists(dsc_development_filespec)) { - UUID dsc_development_uuid = GetSharedCacheUUID( - dsc_development_filespec, byte_order, addr_byte_size); - if (dsc_development_uuid.IsValid() && - dsc_development_uuid == process_shared_cache_uuid) { - dsc_filespec = dsc_development_filespec; - dsc_uuid = dsc_development_uuid; - } - } - if (!dsc_uuid.IsValid() && - FileSystem::Instance().Exists(dsc_nondevelopment_filespec)) { - UUID dsc_nondevelopment_uuid = GetSharedCacheUUID( - dsc_nondevelopment_filespec, byte_order, addr_byte_size); - if (dsc_nondevelopment_uuid.IsValid() && - dsc_nondevelopment_uuid == process_shared_cache_uuid) { - dsc_filespec = dsc_nondevelopment_filespec; - dsc_uuid = dsc_nondevelopment_uuid; - } - } - } + __block bool found_image = false; + __block void *nlist_buffer = nullptr; + __block unsigned nlist_count = 0; + __block char *string_table = nullptr; + __block vm_offset_t vm_nlist_memory = 0; + 
__block mach_msg_type_number_t vm_nlist_bytes_read = 0; + __block vm_offset_t vm_string_memory = 0; + __block mach_msg_type_number_t vm_string_bytes_read = 0; + + auto _ = llvm::make_scope_exit(^{ + if (vm_nlist_memory) + vm_deallocate(mach_task_self(), vm_nlist_memory, vm_nlist_bytes_read); + if (vm_string_memory) + vm_deallocate(mach_task_self(), vm_string_memory, vm_string_bytes_read); + }); - // Failing a UUID match, prefer the development dyld_shared cache if both - // are present. - if (!FileSystem::Instance().Exists(dsc_filespec)) { - if (FileSystem::Instance().Exists(dsc_development_filespec)) { - dsc_filespec = dsc_development_filespec; - } else { - dsc_filespec = dsc_nondevelopment_filespec; - } - } + typedef llvm::DenseMap UndefinedNameToDescMap; + typedef llvm::DenseMap SymbolIndexToName; + UndefinedNameToDescMap undefined_name_to_desc; + SymbolIndexToName reexport_shlib_needs_fixup; - /* The dyld_cache_header has a pointer to the - dyld_cache_local_symbols_info structure (localSymbolsOffset). - The dyld_cache_local_symbols_info structure gives us three things: - 1. The start and count of the nlist records in the dyld_shared_cache - file - 2. The start and size of the strings for these nlist records - 3. The start and count of dyld_cache_local_symbols_entry entries - - There is one dyld_cache_local_symbols_entry per dylib/framework in the - dyld shared cache. - The "dylibOffset" field is the Mach-O header of this dylib/framework in - the dyld shared cache. - The dyld_cache_local_symbols_entry also lists the start of this - dylib/framework's nlist records - and the count of how many nlist records there are for this - dylib/framework. 
- */ - - // Process the dyld shared cache header to find the unmapped symbols - - DataBufferSP dsc_data_sp = MapFileData( - dsc_filespec, sizeof(struct lldb_copy_dyld_cache_header_v1), 0); - if (!dsc_uuid.IsValid()) { - dsc_uuid = GetSharedCacheUUID(dsc_filespec, byte_order, addr_byte_size); - } - if (dsc_data_sp) { - DataExtractor dsc_header_data(dsc_data_sp, byte_order, addr_byte_size); + dyld_for_each_installed_shared_cache(^(dyld_shared_cache_t shared_cache) { + uuid_t cache_uuid; + dyld_shared_cache_copy_uuid(shared_cache, &cache_uuid); + if (found_image) + return; - bool uuid_match = true; - if (dsc_uuid.IsValid() && process) { if (process_shared_cache_uuid.IsValid() && - dsc_uuid != process_shared_cache_uuid) { - // The on-disk dyld_shared_cache file is not the same as the one in - // this process' memory, don't use it. - uuid_match = false; - ModuleSP module_sp(GetModule()); - if (module_sp) - module_sp->ReportWarning("process shared cache does not match " - "on-disk dyld_shared_cache file, some " - "symbol names will be missing."); - } - } + process_shared_cache_uuid != UUID::fromOptionalData(&cache_uuid, 16)) + return; - offset = offsetof(struct lldb_copy_dyld_cache_header_v1, mappingOffset); - - uint32_t mappingOffset = dsc_header_data.GetU32(&offset); - - // If the mappingOffset points to a location inside the header, we've - // opened an old dyld shared cache, and should not proceed further. 
- if (uuid_match && - mappingOffset >= sizeof(struct lldb_copy_dyld_cache_header_v1)) { - - DataBufferSP dsc_mapping_info_data_sp = MapFileData( - dsc_filespec, sizeof(struct lldb_copy_dyld_cache_mapping_info), - mappingOffset); - - DataExtractor dsc_mapping_info_data(dsc_mapping_info_data_sp, - byte_order, addr_byte_size); - offset = 0; - - // The File addresses (from the in-memory Mach-O load commands) for - // the shared libraries in the shared library cache need to be - // adjusted by an offset to match up with the dylibOffset identifying - // field in the dyld_cache_local_symbol_entry's. This offset is - // recorded in mapping_offset_value. - const uint64_t mapping_offset_value = - dsc_mapping_info_data.GetU64(&offset); - - offset = - offsetof(struct lldb_copy_dyld_cache_header_v1, localSymbolsOffset); - uint64_t localSymbolsOffset = dsc_header_data.GetU64(&offset); - uint64_t localSymbolsSize = dsc_header_data.GetU64(&offset); - - if (localSymbolsOffset && localSymbolsSize) { - // Map the local symbols - DataBufferSP dsc_local_symbols_data_sp = - MapFileData(dsc_filespec, localSymbolsSize, localSymbolsOffset); - - if (dsc_local_symbols_data_sp) { - DataExtractor dsc_local_symbols_data(dsc_local_symbols_data_sp, - byte_order, addr_byte_size); - - offset = 0; - - typedef llvm::DenseMap UndefinedNameToDescMap; - typedef llvm::DenseMap SymbolIndexToName; - UndefinedNameToDescMap undefined_name_to_desc; - SymbolIndexToName reexport_shlib_needs_fixup; - - // Read the local_symbols_infos struct in one shot - struct lldb_copy_dyld_cache_local_symbols_info local_symbols_info; - dsc_local_symbols_data.GetU32(&offset, - &local_symbols_info.nlistOffset, 6); - - SectionSP text_section_sp( - section_list->FindSectionByName(GetSegmentNameTEXT())); - - uint32_t header_file_offset = - (text_section_sp->GetFileAddress() - mapping_offset_value); - - offset = local_symbols_info.entriesOffset; - for (uint32_t entry_index = 0; - entry_index < local_symbols_info.entriesCount; 
entry_index++) { - struct lldb_copy_dyld_cache_local_symbols_entry - local_symbols_entry; - local_symbols_entry.dylibOffset = - dsc_local_symbols_data.GetU32(&offset); - local_symbols_entry.nlistStartIndex = - dsc_local_symbols_data.GetU32(&offset); - local_symbols_entry.nlistCount = - dsc_local_symbols_data.GetU32(&offset); - - if (header_file_offset == local_symbols_entry.dylibOffset) { - unmapped_local_symbols_found = local_symbols_entry.nlistCount; + dyld_shared_cache_for_each_image(shared_cache, ^(dyld_image_t image) { + uuid_t dsc_image_uuid; + if (found_image) + return; + + dyld_image_copy_uuid(image, &dsc_image_uuid); + if (image_uuid != UUID::fromOptionalData(dsc_image_uuid, 16)) + return; + + found_image = true; + + // Compute the size of the string table. We need to ask dyld for a + // new SPI to avoid this step. + dyld_image_local_nlist_content_4Symbolication( + image, ^(const void *nlistStart, uint64_t nlistCount, + const char *stringTable) { + if (!nlistStart || !nlistCount) + return; + + // The buffers passed here are valid only inside the block. + // Use vm_read to make a cheap copy of them available for our + // processing later. + kern_return_t ret = + vm_read(mach_task_self(), (vm_address_t)nlistStart, + nlist_byte_size * nlistCount, &vm_nlist_memory, + &vm_nlist_bytes_read); + if (ret != KERN_SUCCESS) + return; + assert(vm_nlist_bytes_read == nlist_byte_size * nlistCount); + + // We don't know the size of the string table. It's cheaper + // to map the whole VM region than to determine the size by + // parsing all the nlist entries.
+ vm_address_t string_address = (vm_address_t)stringTable; + vm_size_t region_size; + mach_msg_type_number_t info_count = VM_REGION_BASIC_INFO_COUNT_64; + vm_region_basic_info_data_t info; + memory_object_name_t object; + ret = vm_region_64(mach_task_self(), &string_address, + &region_size, VM_REGION_BASIC_INFO_64, + (vm_region_info_t)&info, &info_count, &object); + if (ret != KERN_SUCCESS) + return; + + ret = vm_read(mach_task_self(), (vm_address_t)stringTable, + region_size - + ((vm_address_t)stringTable - string_address), + &vm_string_memory, &vm_string_bytes_read); + if (ret != KERN_SUCCESS) + return; + + nlist_buffer = (void *)vm_nlist_memory; + string_table = (char *)vm_string_memory; + nlist_count = nlistCount; + }); + }); + }); + if (nlist_buffer) { + DataExtractor dsc_local_symbols_data(nlist_buffer, + nlist_count * nlist_byte_size, + byte_order, addr_byte_size); + unmapped_local_symbols_found = nlist_count; // The normal nlist code cannot correctly size the Symbols // array, we need to allocate it here.
@@ -2869,13 +2782,10 @@ unmapped_local_symbols_found - m_dysymtab.nlocalsym); num_syms = symtab.GetNumSymbols(); - nlist_data_offset = - local_symbols_info.nlistOffset + - (nlist_byte_size * local_symbols_entry.nlistStartIndex); - uint32_t string_table_offset = local_symbols_info.stringsOffset; + lldb::offset_t nlist_data_offset = 0; for (uint32_t nlist_index = 0; - nlist_index < local_symbols_entry.nlistCount; + nlist_index < nlist_count; nlist_index++) { ///////////////////////////// { @@ -2887,8 +2797,7 @@ struct nlist_64 nlist = *nlist_maybe; SymbolType type = eSymbolTypeInvalid; - const char *symbol_name = dsc_local_symbols_data.PeekCStr( - string_table_offset + nlist.n_strx); + const char *symbol_name = string_table + nlist.n_strx; if (symbol_name == NULL) { // No symbol should be NULL, even the symbols with no @@ -2898,7 +2807,7 @@ Host::eSystemLogError, "error: DSC unmapped local symbol[%u] has invalid " "string table offset 0x%x in %s, ignoring symbol\n", - entry_index, nlist.n_strx, + nlist_index, nlist.n_strx, module_sp->GetFileSpec().GetPath().c_str()); continue; } @@ -3759,8 +3668,6 @@ } ///////////////////////////// } - break; // No more entries to consider - } } for (const auto &pos : reexport_shlib_needs_fixup) { @@ -3774,14 +3681,9 @@ } } } - } - } - } - } - // Must reset this in case it was mutated above! 
- nlist_data_offset = 0; #endif + lldb::offset_t nlist_data_offset = 0; if (nlist_data.GetByteSize() > 0) { diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h --- a/llvm/include/llvm/Support/Compiler.h +++ b/llvm/include/llvm/Support/Compiler.h @@ -17,9 +17,6 @@ #include "llvm/Config/llvm-config.h" -#ifdef __cplusplus -#include -#endif #include #if defined(_MSC_VER) diff --git a/llvm/include/llvm/Transforms/IPO/IROutliner.h b/llvm/include/llvm/Transforms/IPO/IROutliner.h --- a/llvm/include/llvm/Transforms/IPO/IROutliner.h +++ b/llvm/include/llvm/Transforms/IPO/IROutliner.h @@ -95,6 +95,10 @@ /// required for the following basic blocks in this case. bool EndsInBranch = false; + /// The PHIBlocks with their corresponding return block based on the return + /// value as the key. + DenseMap PHIBlocks; + /// Mapping of the argument number in the deduplicated function /// to a given constant, which is used when creating the arguments to the call /// to the newly created deduplicated function. This is handled separately @@ -182,7 +186,14 @@ IROutliner(function_ref GTTI, function_ref GIRSI, function_ref GORE) - : getTTI(GTTI), getIRSI(GIRSI), getORE(GORE) {} + : getTTI(GTTI), getIRSI(GIRSI), getORE(GORE) { + + // Check that the DenseMap implementation has not changed. + assert(DenseMapInfo::getEmptyKey() == (unsigned)-1 && + "DenseMapInfo's empty key isn't -1!"); + assert(DenseMapInfo::getTombstoneKey() == (unsigned)-2 && + "DenseMapInfo's tombstone key isn't -2!"); + } bool run(Module &M); private: @@ -356,6 +367,11 @@ Function *F = CI.getCalledFunction(); if (!F || CI.isIndirectCall() || !F->hasName()) return false; + // Returning twice can cause issues with the state of the function call + // that were not expected when the function was used, so we do not include + // the call in outlined functions. + if (CI.canReturnTwice()) + return false; return true; } // TODO: Handle FreezeInsts. 
Since a frozen value could be frozen inside diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp --- a/llvm/lib/ProfileData/InstrProfCorrelator.cpp +++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp @@ -167,13 +167,19 @@ return {}; } auto &DU = *Die.getDwarfUnit(); + auto AddressSize = DU.getAddressByteSize(); for (auto &Location : *Locations) { - auto AddressSize = DU.getAddressByteSize(); DataExtractor Data(Location.Expr, DICtx->isLittleEndian(), AddressSize); DWARFExpression Expr(Data, AddressSize); - for (auto &Op : Expr) - if (Op.getCode() == dwarf::DW_OP_addr) + for (auto &Op : Expr) { + if (Op.getCode() == dwarf::DW_OP_addr) { return Op.getRawOperand(0); + } else if (Op.getCode() == dwarf::DW_OP_addrx) { + uint64_t Index = Op.getRawOperand(0); + if (auto SA = DU.getAddrOffsetSectionItem(Index)) + return SA->Address; + } + } } return {}; } diff --git a/llvm/lib/Support/MemAlloc.cpp b/llvm/lib/Support/MemAlloc.cpp --- a/llvm/lib/Support/MemAlloc.cpp +++ b/llvm/lib/Support/MemAlloc.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/MemAlloc.h" +#include // These are out of line to have __cpp_aligned_new not affect ABI. diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -614,6 +614,12 @@ setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); + // nxvXi64 MULHS/MULHU requires the V extension instead of Zve64*. 
+ if (VT.getVectorElementType() == MVT::i64 && !Subtarget.hasStdExtV()) { + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); + } + setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); @@ -910,8 +916,11 @@ setOperationAction(ISD::UMAX, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); - setOperationAction(ISD::MULHS, VT, Custom); - setOperationAction(ISD::MULHU, VT, Custom); + // vXi64 MULHS/MULHU requires the V extension instead of Zve64*. + if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV()) { + setOperationAction(ISD::MULHS, VT, Custom); + setOperationAction(ISD::MULHU, VT, Custom); + } setOperationAction(ISD::SADDSAT, VT, Custom); setOperationAction(ISD::UADDSAT, VT, Custom); diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -156,6 +156,7 @@ bool hasStdExtF() const { return HasStdExtF; } bool hasStdExtD() const { return HasStdExtD; } bool hasStdExtC() const { return HasStdExtC; } + bool hasStdExtV() const { return HasStdExtV; } bool hasStdExtZba() const { return HasStdExtZba; } bool hasStdExtZbb() const { return HasStdExtZbb; } bool hasStdExtZbc() const { return HasStdExtZbc; } diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -106,6 +106,16 @@ /// of the region. unsigned BranchesToOutside = 0; + /// Tracker counting backwards from the highest unsigned value possible to + /// avoid conflicting with the GVNs of assigned values. We start at -3 since + /// -2 and -1 are assigned by the DenseMap. 
+ unsigned PHINodeGVNTracker = -3; + + DenseMap, SmallVector>> + PHINodeGVNToGVNs; + DenseMap GVNsToPHINodeGVN; + /// The number of instructions that will be outlined by extracting \ref /// Regions. InstructionCost Benefit = 0; @@ -356,6 +366,24 @@ return Benefit; } +/// Check the \p OutputMappings structure for value \p Input, if it exists +/// it has been used as an output for outlining, and has been renamed, and we +/// return the new value, otherwise, we return the same value. +/// +/// \param OutputMappings [in] - The mapping of values to their renamed value +/// after being used as an output for an outlined region. +/// \param Input [in] - The value to find the remapped value of, if it exists. +/// \return The remapped value if it has been renamed, and the same value if has +/// not. +static Value *findOutputMapping(const DenseMap OutputMappings, + Value *Input) { + DenseMap::const_iterator OutputMapping = + OutputMappings.find(Input); + if (OutputMapping != OutputMappings.end()) + return OutputMapping->second; + return Input; +} + /// Find whether \p Region matches the global value numbering to Constant /// mapping found so far. /// @@ -832,6 +860,209 @@ Region.NumExtractedInputs = OriginalIndex; } +/// Check if the \p V has any uses outside of the region other than \p PN. +/// +/// \param V [in] - The value to check. +/// \param PHILoc [in] - The location in the PHINode of \p V. +/// \param PN [in] - The PHINode using \p V. +/// \param Exits [in] - The potential blocks we exit to from the outlined +/// region. +/// \param BlocksInRegion [in] - The basic blocks contained in the region. +/// \returns true if \p V has any uses outside its region other than \p PN. +static bool outputHasNonPHI(Value *V, unsigned PHILoc, PHINode &PN, + SmallPtrSet &Exits, + DenseSet &BlocksInRegion) { + // We check to see if the value is used by the PHINode from some other + // predecessor not included in the region. If it is, we make sure + // to keep it as an output.
+ SmallVector IncomingNumbers(PN.getNumIncomingValues()); + std::iota(IncomingNumbers.begin(), IncomingNumbers.end(), 0); + if (any_of(IncomingNumbers, [PHILoc, &PN, V, &BlocksInRegion](unsigned Idx) { + return (Idx != PHILoc && V == PN.getIncomingValue(Idx) && + !BlocksInRegion.contains(PN.getIncomingBlock(Idx))); + })) + return true; + + // Check if the value is used by any other instructions outside the region. + return any_of(V->users(), [&Exits, &BlocksInRegion](User *U) { + Instruction *I = dyn_cast(U); + if (!I) + return false; + + // If the use of the item is inside the region, we skip it. Uses + // inside the region give us useful information about how the item could be + // used as an output. + BasicBlock *Parent = I->getParent(); + if (BlocksInRegion.contains(Parent)) + return false; + + // If it's not a PHINode then we definitely know the use matters. This + // output value will not be completely combined with another item in a PHINode + // as it is directly referenced by another non-phi instruction + if (!isa(I)) + return true; + + // If we have a PHINode outside one of the exit locations, then it + // can be considered an outside use as well. If there is a PHINode + // contained in the Exit where this value's use matters, it will be + // caught when we analyze that PHINode. + if (!Exits.contains(Parent)) + return true; + + return false; + }); +} + +/// Test whether \p CurrentExitFromRegion contains any PhiNodes that should be +/// considered outputs. A PHINode is an output when more than one incoming +/// value has been marked by the CodeExtractor as an output. +/// +/// \param CurrentExitFromRegion [in] - The block to analyze. +/// \param PotentialExitsFromRegion [in] - The potential exit blocks from the +/// region. +/// \param RegionBlocks [in] - The basic blocks in the region. +/// \param Outputs [in, out] - The existing outputs for the region, we may add +/// PHINodes to this as we find that they replace output values.
+/// \param OutputsReplacedByPHINode [out] - A set containing outputs that are +/// totally replaced by a PHINode. +/// \param OutputsWithNonPhiUses [out] - A set containing outputs that are used +/// in PHINodes, but have other uses, and should still be considered outputs. +static void analyzeExitPHIsForOutputUses( + BasicBlock *CurrentExitFromRegion, + SmallPtrSet &PotentialExitsFromRegion, + DenseSet &RegionBlocks, SetVector &Outputs, + DenseSet &OutputsReplacedByPHINode, + DenseSet &OutputsWithNonPhiUses) { + for (PHINode &PN : CurrentExitFromRegion->phis()) { + // Find all incoming values from the outlining region. + SmallVector IncomingVals; + for (unsigned I = 0, E = PN.getNumIncomingValues(); I < E; ++I) + if (RegionBlocks.contains(PN.getIncomingBlock(I))) + IncomingVals.push_back(I); + + // Do not process PHI if there are no predecessors from region. + unsigned NumIncomingVals = IncomingVals.size(); + if (NumIncomingVals == 0) + continue; + + // If there is one predecessor, we mark it as a value that needs to be kept + // as an output. + if (NumIncomingVals == 1) { + Value *V = PN.getIncomingValue(*IncomingVals.begin()); + OutputsWithNonPhiUses.insert(V); + OutputsReplacedByPHINode.erase(V); + continue; + } + + // This PHINode will be used as an output value, so we add it to our list. + Outputs.insert(&PN); + + // Not all of the incoming values should be ignored as other inputs and + // outputs may have uses in outlined region. If they have other uses + // outside of the single PHINode we should not skip over it. 
+ for (unsigned Idx : IncomingVals) { + Value *V = PN.getIncomingValue(Idx); + if (outputHasNonPHI(V, Idx, PN, PotentialExitsFromRegion, RegionBlocks)) { + OutputsWithNonPhiUses.insert(V); + OutputsReplacedByPHINode.erase(V); + continue; + } + if (!OutputsWithNonPhiUses.contains(V)) + OutputsReplacedByPHINode.insert(V); + } + } +} + +// Represents the type for the unsigned number denoting the output number for +// phi node, along with the canonical number for the exit block. +using ArgLocWithBBCanon = std::pair; +// The list of canonical numbers for the incoming values to a PHINode. +using CanonList = SmallVector; +// The pair type representing the set of canonical values being combined in the +// PHINode, along with the location data for the PHINode. +using PHINodeData = std::pair; + +/// Encode \p PND as an integer for easy lookup based on the argument location, +/// the parent BasicBlock canonical numbering, and the canonical numbering of +/// the values stored in the PHINode. +/// +/// \param PND - The data to hash. +/// \returns The hash code of \p PND. +static hash_code encodePHINodeData(PHINodeData &PND) { + return llvm::hash_combine( + llvm::hash_value(PND.first.first), llvm::hash_value(PND.first.second), + llvm::hash_combine_range(PND.second.begin(), PND.second.end())); +} + +/// Create a special GVN for PHINodes that will be used outside of +/// the region. We create a hash code based on the Canonical number of the +/// parent BasicBlock, the canonical numbering of the values stored in the +/// PHINode and the aggregate argument location. This is used to find whether +/// this PHINode type has been given a canonical numbering already. If not, we +/// assign it a value and store it for later use. The value is returned to +/// identify different output schemes for the set of regions. +/// +/// \param Region - The region that \p PN is an output for. +/// \param PN - The PHINode we are analyzing. +/// \param AggArgIdx - The argument \p PN will be stored into. 
+/// \returns An optional holding the assigned canonical number, or None if +/// there is some attribute of the PHINode blocking it from being used. +static Optional getGVNForPHINode(OutlinableRegion &Region, + PHINode *PN, unsigned AggArgIdx) { + OutlinableGroup &Group = *Region.Parent; + IRSimilarityCandidate &Cand = *Region.Candidate; + BasicBlock *PHIBB = PN->getParent(); + CanonList PHIGVNs; + for (Value *Incoming : PN->incoming_values()) { + // If we cannot find a GVN, this means that the input to the PHINode is + // not included in the region we are trying to analyze, meaning, that if + // it was outlined, we would be adding an extra input. We ignore this + // case for now, and so ignore the region. + Optional OGVN = Cand.getGVN(Incoming); + if (!OGVN.hasValue()) { + Region.IgnoreRegion = true; + return None; + } + + // Collect the canonical numbers of the values in the PHINode. + unsigned GVN = OGVN.getValue(); + OGVN = Cand.getCanonicalNum(GVN); + assert(OGVN.hasValue() && "No GVN found for incoming value?"); + PHIGVNs.push_back(*OGVN); + } + + // Now that we have the GVNs for the incoming values, we are going to combine + // them with the GVN of the incoming block, and the output location of the + // PHINode to generate a hash value representing this instance of the PHINode. + DenseMap::iterator GVNToPHIIt; + DenseMap::iterator PHIToGVNIt; + Optional BBGVN = Cand.getGVN(PHIBB); + assert(BBGVN.hasValue() && "Could not find GVN for the incoming block!"); + + BBGVN = Cand.getCanonicalNum(BBGVN.getValue()); + assert(BBGVN.hasValue() && + "Could not find canonical number for the incoming block!"); + // Create a pair of the exit block canonical value, and the aggregate + // argument location, connected to the canonical numbers stored in the + // PHINode.
+ PHINodeData TemporaryPair = + std::make_pair(std::make_pair(BBGVN.getValue(), AggArgIdx), PHIGVNs); + hash_code PHINodeDataHash = encodePHINodeData(TemporaryPair); + + // Look for and create a new entry in our connection between canonical + // numbers for PHINodes, and the set of objects we just created. + GVNToPHIIt = Group.GVNsToPHINodeGVN.find(PHINodeDataHash); + if (GVNToPHIIt == Group.GVNsToPHINodeGVN.end()) { + bool Inserted = false; + std::tie(PHIToGVNIt, Inserted) = Group.PHINodeGVNToGVNs.insert( + std::make_pair(Group.PHINodeGVNTracker, TemporaryPair)); + std::tie(GVNToPHIIt, Inserted) = Group.GVNsToPHINodeGVN.insert( + std::make_pair(PHINodeDataHash, Group.PHINodeGVNTracker--)); + } + + return GVNToPHIIt->second; +} + /// Create a mapping of the output arguments for the \p Region to the output /// arguments of the overall outlined function. /// @@ -844,35 +1075,25 @@ IRSimilarityCandidate &C = *Region.Candidate; SmallVector BE; - DenseSet BBSet; - C.getBasicBlocks(BBSet, BE); + DenseSet BlocksInRegion; + C.getBasicBlocks(BlocksInRegion, BE); // Find the exits to the region. SmallPtrSet Exits; for (BasicBlock *Block : BE) for (BasicBlock *Succ : successors(Block)) - if (!BBSet.contains(Succ)) + if (!BlocksInRegion.contains(Succ)) Exits.insert(Succ); // After determining which blocks exit to PHINodes, we add these PHINodes to // the set of outputs to be processed. We also check the incoming values of // the PHINodes for whether they should no longer be considered outputs. - for (BasicBlock *ExitBB : Exits) { - for (PHINode &PN : ExitBB->phis()) { - // Find all incoming values from the outlining region. - SmallVector IncomingVals; - for (unsigned Idx = 0; Idx < PN.getNumIncomingValues(); ++Idx) - if (BBSet.contains(PN.getIncomingBlock(Idx))) - IncomingVals.push_back(Idx); - - // Do not process PHI if there is one (or fewer) predecessor from region. 
- if (IncomingVals.size() <= 1) - continue; - - Region.IgnoreRegion = true; - return; - } - } + DenseSet OutputsReplacedByPHINode; + DenseSet OutputsWithNonPhiUses; + for (BasicBlock *ExitBB : Exits) + analyzeExitPHIsForOutputUses(ExitBB, Exits, BlocksInRegion, Outputs, + OutputsReplacedByPHINode, + OutputsWithNonPhiUses); // This counts the argument number in the extracted function. unsigned OriginalIndex = Region.NumExtractedInputs; @@ -895,9 +1116,13 @@ // do not have to be in same order, but are functionally the same, we will // have to use a different scheme, as one-to-one correspondence is not // guaranteed. - unsigned GlobalValue = C.getGVN(Output).getValue(); unsigned ArgumentSize = Group.ArgumentTypes.size(); + // If the output is combined in a PHINode, we make sure to skip over it. + if (OutputsReplacedByPHINode.contains(Output)) + continue; + + unsigned AggArgIdx = 0; for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) { if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType())) continue; @@ -909,7 +1134,7 @@ AggArgsUsed.insert(Jdx); Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx)); Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex)); - Region.GVNStores.push_back(GlobalValue); + AggArgIdx = Jdx; break; } @@ -918,18 +1143,54 @@ // function to handle this output and create a mapping to it. if (!TypeFound) { Group.ArgumentTypes.push_back(PointerType::getUnqual(Output->getType())); - AggArgsUsed.insert(Group.ArgumentTypes.size() - 1); + // Mark the new pointer type as the last value in the aggregate argument + // list. 
+ unsigned ArgTypeIdx = Group.ArgumentTypes.size() - 1; + AggArgsUsed.insert(ArgTypeIdx); Region.ExtractedArgToAgg.insert( - std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1)); + std::make_pair(OriginalIndex, ArgTypeIdx)); Region.AggArgToExtracted.insert( - std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex)); - Region.GVNStores.push_back(GlobalValue); + std::make_pair(ArgTypeIdx, OriginalIndex)); + AggArgIdx = ArgTypeIdx; + } + + // TODO: Adapt to the extra input from the PHINode. + PHINode *PN = dyn_cast(Output); + + Optional GVN; + if (PN && !BlocksInRegion.contains(PN->getParent())) { + // Values outside the region can be combined into PHINode when we + // have multiple exits. We collect both of these into a list to identify + // which values are being used in the PHINode. Each list identifies a + // different PHINode, and a different output. We store the PHINode as its + // own canonical value. These canonical values are also dependent on the + // output argument it is saved to. + + // If two PHINodes have the same canonical values, but different aggregate + // argument locations, then they will have distinct Canonical Values. + GVN = getGVNForPHINode(Region, PN, AggArgIdx); + if (!GVN.hasValue()) + return; + } else { + // If we do not have a PHINode we use the global value numbering for the + // output value, to find the canonical number to add to the set of stored + // values. + GVN = C.getGVN(Output); + GVN = C.getCanonicalNum(*GVN); } - stable_sort(Region.GVNStores); + // Each region has a potentially unique set of outputs. We save which + // values are output in a list of canonical values so we can differentiate + // among the different store schemes. + Region.GVNStores.push_back(*GVN); + OriginalIndex++; TypeIndex++; } + + // We sort the stored values to make sure that we are not affected by analysis + // order when determining what combination of items were stored. 
+ stable_sort(Region.GVNStores); } void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region, @@ -1065,6 +1326,214 @@ return Call; } +/// Find or create a BasicBlock in the outlined function containing PhiBlocks +/// for \p RetVal. +/// +/// \param Group - The OutlinableGroup containing the information about the +/// overall outlined function. +/// \param RetVal - The return value or exit option that we are currently +/// evaluating. +/// \returns The found or newly created BasicBlock to contain the needed +/// PHINodes to be used as outputs. +static BasicBlock *findOrCreatePHIBlock(OutlinableGroup &Group, Value *RetVal) { + DenseMap::iterator PhiBlockForRetVal, + ReturnBlockForRetVal; + PhiBlockForRetVal = Group.PHIBlocks.find(RetVal); + ReturnBlockForRetVal = Group.EndBBs.find(RetVal); + assert(ReturnBlockForRetVal != Group.EndBBs.end() && + "Could not find output value!"); + BasicBlock *ReturnBB = ReturnBlockForRetVal->second; + + // Find if a PHIBlock exists for this return value already. If it is + // the first time we are analyzing this, we will not, so we record it. + PhiBlockForRetVal = Group.PHIBlocks.find(RetVal); + if (PhiBlockForRetVal != Group.PHIBlocks.end()) + return PhiBlockForRetVal->second; + + // If we did not find a block, we create one, and insert it into the + // overall function and record it. + bool Inserted = false; + BasicBlock *PHIBlock = BasicBlock::Create(ReturnBB->getContext(), "phi_block", + ReturnBB->getParent()); + std::tie(PhiBlockForRetVal, Inserted) = + Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock)); + + // We find the predecessors of the return block in the newly created outlined + // function in order to point them to the new PHIBlock rather than the already + // existing return block. 
+ SmallVector BranchesToChange; + for (BasicBlock *Pred : predecessors(ReturnBB)) + BranchesToChange.push_back(cast(Pred->getTerminator())); + + // Now we mark the branch instructions found, and change the references of the + // return block to the newly created PHIBlock. + for (BranchInst *BI : BranchesToChange) + for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ < End; Succ++) { + if (BI->getSuccessor(Succ) != ReturnBB) + continue; + BI->setSuccessor(Succ, PHIBlock); + } + + BranchInst::Create(ReturnBB, PHIBlock); + + return PhiBlockForRetVal->second; +} + +/// For the function call now representing the \p Region, find the passed value +/// to that call that represents Argument \p A at the call location if the +/// call has already been replaced with a call to the overall, aggregate +/// function. +/// +/// \param A - The Argument to get the passed value for. +/// \param Region - The extracted Region corresponding to the outlined function. +/// \returns The Value representing \p A at the call site. +static Value * +getPassedArgumentInAlreadyOutlinedFunction(const Argument *A, + const OutlinableRegion &Region) { + // If we don't need to adjust the argument number at all (since the call + // has already been replaced by a call to the overall outlined function) + // we can just get the specified argument. + return Region.Call->getArgOperand(A->getArgNo()); +} + +/// For the function call now representing the \p Region, find the passed value +/// to that call that represents Argument \p A at the call location if the +/// call has only been replaced by the call to the aggregate function. +/// +/// \param A - The Argument to get the passed value for. +/// \param Region - The extracted Region corresponding to the outlined function. +/// \returns The Value representing \p A at the call site. 
+static Value * +getPassedArgumentAndAdjustArgumentLocation(const Argument *A, + const OutlinableRegion &Region) { + unsigned ArgNum = A->getArgNo(); + + // If it is a constant, we can look at our mapping from when we created + // the outputs to figure out what the constant value is. + if (Region.AggArgToConstant.count(ArgNum)) + return Region.AggArgToConstant.find(ArgNum)->second; + + // If it is not a constant, and we are not looking at the overall function, we + // need to adjust which argument we are looking at. + ArgNum = Region.AggArgToExtracted.find(ArgNum)->second; + return Region.Call->getArgOperand(ArgNum); +} + +/// Find the canonical numbering for the incoming Values into the PHINode \p PN. +/// +/// \param PN [in] - The PHINode that we are finding the canonical numbers for. +/// \param Region [in] - The OutlinableRegion containing \p PN. +/// \param OutputMappings [in] - The mapping of output values from outlined +/// region to their original values. +/// \param CanonNums [out] - The canonical numbering for the incoming values to +/// \p PN. +/// \param ReplacedWithOutlinedCall - A flag to use the extracted function call +/// of \p Region rather than the overall function's call. +static void +findCanonNumsForPHI(PHINode *PN, OutlinableRegion &Region, + const DenseMap &OutputMappings, + DenseSet &CanonNums, + bool ReplacedWithOutlinedCall = true) { + // Iterate over the incoming values. + for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) { + Value *IVal = PN->getIncomingValue(Idx); + // If we have an argument as incoming value, we need to grab the passed + // value from the call itself. + if (Argument *A = dyn_cast(IVal)) { + if (ReplacedWithOutlinedCall) + IVal = getPassedArgumentInAlreadyOutlinedFunction(A, Region); + else + IVal = getPassedArgumentAndAdjustArgumentLocation(A, Region); + } + + // Get the original value if it has been replaced by an output value. 
+ IVal = findOutputMapping(OutputMappings, IVal); + + // Find and add the canonical number for the incoming value. + Optional GVN = Region.Candidate->getGVN(IVal); + assert(GVN.hasValue() && "No GVN for incoming value"); + Optional CanonNum = Region.Candidate->getCanonicalNum(*GVN); + assert(CanonNum.hasValue() && "No Canonical Number for GVN"); + CanonNums.insert(*CanonNum); + } +} + +/// Find, or add PHINode \p PN to the combined PHINode Block \p OverallPHIBlock +/// in order to condense the number of instructions added to the outlined +/// function. +/// +/// \param PN [in] - The PHINode that we are finding the canonical numbers for. +/// \param Region [in] - The OutlinableRegion containing \p PN. +/// \param OverallPhiBlock [in] - The overall PHIBlock we are trying to find +/// \p PN in. +/// \param OutputMappings [in] - The mapping of output values from outlined +/// region to their original values. +/// \returns the newly found or created PHINode in \p OverallPhiBlock. +static PHINode* +findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region, +                       BasicBlock *OverallPhiBlock, +                       const DenseMap &OutputMappings) { + OutlinableGroup &Group = *Region.Parent; + + DenseSet PNCanonNums; + // We have to use the extracted function since we have not merged this region + // into the overall function yet. We make sure to reassign the argument + // numbering since it is possible that the argument ordering is different + // between the functions. + findCanonNumsForPHI(&PN, Region, OutputMappings, PNCanonNums, + /* ReplacedWithOutlinedCall = */ false); + + OutlinableRegion *FirstRegion = Group.Regions[0]; + DenseSet CurrentCanonNums; + // Find the Canonical Numbering for each PHINode, if it matches, we replace + // the uses of the PHINode we are searching for, with the found PHINode. 
+ for (PHINode &CurrPN : OverallPhiBlock->phis()) { + CurrentCanonNums.clear(); + findCanonNumsForPHI(&CurrPN, *FirstRegion, OutputMappings, CurrentCanonNums, + /* ReplacedWithOutlinedCall = */ true); + + if (all_of(PNCanonNums, [&CurrentCanonNums](unsigned CanonNum) { + return CurrentCanonNums.contains(CanonNum); + })) + return &CurrPN; + } + + // If we've made it here, it means we weren't able to replace the PHINode, so + // we must insert it ourselves. + PHINode *NewPN = cast(PN.clone()); + NewPN->insertBefore(&*OverallPhiBlock->begin()); + for (unsigned Idx = 0, Edx = NewPN->getNumIncomingValues(); Idx < Edx; + Idx++) { + Value *IncomingVal = NewPN->getIncomingValue(Idx); + BasicBlock *IncomingBlock = NewPN->getIncomingBlock(Idx); + + // Find corresponding basic block in the overall function for the incoming + // block. + Instruction *FirstNonPHI = IncomingBlock->getFirstNonPHI(); + assert(FirstNonPHI && "Incoming block is empty?"); + Value *CorrespondingVal = + Region.findCorrespondingValueIn(*FirstRegion, FirstNonPHI); + assert(CorrespondingVal && "Value is nullptr?"); + BasicBlock *BlockToUse = cast(CorrespondingVal)->getParent(); + NewPN->setIncomingBlock(Idx, BlockToUse); + + // If we have an argument we make sure we replace using the argument from + // the correct function. + if (Argument *A = dyn_cast(IncomingVal)) { + Value *Val = Group.OutlinedFunction->getArg(A->getArgNo()); + NewPN->setIncomingValue(Idx, Val); + continue; + } + + // Find the corresponding value in the overall function. + IncomingVal = findOutputMapping(OutputMappings, IncomingVal); + Value *Val = Region.findCorrespondingValueIn(*FirstRegion, IncomingVal); + assert(Val && "Value is nullptr?"); + NewPN->setIncomingValue(Idx, Val); + } + return NewPN; +} + // Within an extracted function, replace the argument uses of the extracted // region with the arguments of the function for an OutlinableGroup. 
// @@ -1077,6 +1546,7 @@ static void replaceArgumentUses(OutlinableRegion &Region, DenseMap &OutputBBs, + const DenseMap &OutputMappings, bool FirstFunction = false) { OutlinableGroup &Group = *Region.Parent; assert(Region.ExtractedFunction && "Region has no extracted function?"); @@ -1146,12 +1616,46 @@ LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " << *OutputBB << "\n"); - if (FirstFunction) + // If this is storing a PHINode, we must make sure it is included in the + // overall function. + if (!isa(ValueOperand)) { + if (FirstFunction) + continue; + Value *CorrVal = + Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand); + assert(CorrVal && "Value is nullptr?"); + NewI->setOperand(0, CorrVal); + continue; + } + PHINode *PN = cast(SI->getValueOperand()); + // If it has a value, it was not split by the code extractor, which + // is what we are looking for. + if (Region.Candidate->getGVN(PN).hasValue()) continue; - Value *CorrVal = - Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand); - assert(CorrVal && "Value is nullptr?"); - NewI->setOperand(0, CorrVal); + + // We record the parent block for the PHINode in the Region so that + // we can exclude it from checks later on. + Region.PHIBlocks.insert(std::make_pair(RetVal, PN->getParent())); + + // If this is the first function, we do not need to worry about merging + // this with any other block in the overall outlined function, so we can + // just continue. + if (FirstFunction) { + BasicBlock *PHIBlock = PN->getParent(); + Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock)); + continue; + } + + // We look for the aggregate block that contains the PHINodes leading into + // this exit path. If we can't find one, we create one. + BasicBlock *OverallPhiBlock = findOrCreatePHIBlock(Group, RetVal); + + // For our PHINode, we find the combined canonical numbering, and + // attempt to find a matching PHINode in the overall PHIBlock. 
If we + // cannot, we copy the PHINode and move it into this new block. + PHINode *NewPN = + findOrCreatePHIInBlock(*PN, Region, OverallPhiBlock, OutputMappings); + NewI->setOperand(0, NewPN); } // If we added an edge for basic blocks without a predecessor, we remove it @@ -1392,7 +1896,12 @@ Module &M, OutlinableGroup &OG, DenseMap &EndBBs, std::vector> &OutputStoreBBs) { // We only need the switch statement if there is more than one store - // combination. + // combination, or there is more than one set of output blocks. The first + // will occur when we store different sets of values for two different + // regions. The second will occur when we have two outputs that are combined + // in a PHINode outside of the region in one outlined instance, and are used + // seaparately in another. This will create the same set of OutputGVNs, but + // will generate two different output schemes. if (OG.OutputGVNCombinations.size() > 1) { Function *AggFunc = OG.OutlinedFunction; // Create a final block for each different return block. @@ -1435,8 +1944,14 @@ return; } + assert(OutputStoreBBs.size() < 2 && "Different store sets not handled!"); + // If there needs to be stores, move them from the output blocks to their - // corresponding ending block. + // corresponding ending block. We do not check that the OutputGVNCombinations + // is equal to 1 here since that could just been the case where there are 0 + // outputs. Instead, we check whether there is more than one set of output + // blocks since this is the only case where we would have to move the + // stores, and erase the extraneous blocks. if (OutputStoreBBs.size() == 1) { LLVM_DEBUG(dbgs() << "Move store instructions to the end block in " << *OG.OutlinedFunction << "\n"); @@ -1468,10 +1983,13 @@ /// set of stores needed for the different functions. /// \param [in,out] FuncsToRemove - Extracted functions to erase from module /// once outlining is complete. 
+/// \param [in] OutputMappings - The mapping of output values from outlined +/// region to their original values. static void fillOverallFunction( Module &M, OutlinableGroup &CurrentGroup, std::vector> &OutputStoreBBs, - std::vector &FuncsToRemove) { + std::vector &FuncsToRemove, + const DenseMap &OutputMappings) { OutlinableRegion *CurrentOS = CurrentGroup.Regions[0]; // Move first extracted function's instructions into new function. @@ -1491,7 +2009,7 @@ CurrentGroup.OutlinedFunction, "output_block_0"); CurrentOS->OutputBlockNum = 0; - replaceArgumentUses(*CurrentOS, NewBBs, true); + replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings, true); replaceConstants(*CurrentOS); // We first identify if any output blocks are empty, if they are we remove @@ -1525,7 +2043,8 @@ OutlinableRegion *CurrentOS; - fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove); + fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove, + OutputMappings); std::vector SortedKeys; for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) { @@ -1539,8 +2058,7 @@ createAndInsertBasicBlocks( CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction, "output_block_" + Twine(static_cast(Idx))); - - replaceArgumentUses(*CurrentOS, NewBBs); + replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings); alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs, CurrentGroup.EndBBs, OutputMappings, OutputStoreBBs); @@ -1708,6 +2226,34 @@ return RegionBenefit; } +/// For the \p OutputCanon number passed in find the value represented by this +/// canonical number. If it is from a PHINode, we pick the first incoming +/// value and return that Value instead. +/// +/// \param Region - The OutlinableRegion to get the Value from. +/// \param OutputCanon - The canonical number to find the Value from. +/// \returns The Value represented by a canonical number \p OutputCanon in \p +/// Region. 
+static Value *findOutputValueInRegion(OutlinableRegion &Region, + unsigned OutputCanon) { + OutlinableGroup &CurrentGroup = *Region.Parent; + // If the value is greater than the value in the tracker, we have a + // PHINode and will instead use one of the incoming values to find the + // type. + if (OutputCanon > CurrentGroup.PHINodeGVNTracker) { + auto It = CurrentGroup.PHINodeGVNToGVNs.find(OutputCanon); + assert(It != CurrentGroup.PHINodeGVNToGVNs.end() && + "Could not find GVN set for PHINode number!"); + assert(It->second.second.size() > 0 && "PHINode does not have any values!"); + OutputCanon = *It->second.second.begin(); + } + Optional OGVN = Region.Candidate->fromCanonicalNum(OutputCanon); + assert(OGVN.hasValue() && "Could not find GVN for Canonical Number?"); + Optional OV = Region.Candidate->fromGVN(*OGVN); + assert(OV.hasValue() && "Could not find value for GVN?"); + return *OV; +} + InstructionCost IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) { InstructionCost OverallCost = 0; @@ -1715,10 +2261,8 @@ TargetTransformInfo &TTI = getTTI(*Region->StartBB->getParent()); // Each output incurs a load after the call, so we add that to the cost. 
- for (unsigned OutputGVN : Region->GVNStores) { - Optional OV = Region->Candidate->fromGVN(OutputGVN); - assert(OV.hasValue() && "Could not find value for GVN?"); - Value *V = OV.getValue(); + for (unsigned OutputCanon : Region->GVNStores) { + Value *V = findOutputValueInRegion(*Region, OutputCanon); InstructionCost LoadCost = TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0, TargetTransformInfo::TCK_CodeSize); @@ -1747,6 +2291,7 @@ InstructionCost OutputCost = 0; unsigned NumOutputBranches = 0; + OutlinableRegion &FirstRegion = *CurrentGroup.Regions[0]; IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate; DenseSet CandidateBlocks; Candidate.getBasicBlocks(CandidateBlocks); @@ -1772,10 +2317,8 @@ for (const ArrayRef &OutputUse : CurrentGroup.OutputGVNCombinations) { - for (unsigned GVN : OutputUse) { - Optional OV = Candidate.fromGVN(GVN); - assert(OV.hasValue() && "Could not find value for GVN?"); - Value *V = OV.getValue(); + for (unsigned OutputCanon : OutputUse) { + Value *V = findOutputValueInRegion(FirstRegion, OutputCanon); InstructionCost StoreCost = TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0, TargetTransformInfo::TCK_CodeSize); @@ -2035,8 +2578,8 @@ continue; SmallVector BE; - DenseSet BBSet; - OS->Candidate->getBasicBlocks(BBSet, BE); + DenseSet BlocksInRegion; + OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, false, "outlined"); @@ -2146,8 +2689,8 @@ OutlinedRegions.clear(); for (OutlinableRegion *OS : CurrentGroup.Regions) { SmallVector BE; - DenseSet BBSet; - OS->Candidate->getBasicBlocks(BBSet, BE); + DenseSet BlocksInRegion; + OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, false, "outlined"); diff --git a/llvm/test/Bindings/Go/go.test 
b/llvm/test/Bindings/Go/go.test --- a/llvm/test/Bindings/Go/go.test +++ b/llvm/test/Bindings/Go/go.test @@ -1,4 +1,5 @@ ; RUN: llvm-go test llvm.org/llvm/bindings/go/llvm ; REQUIRES: shell, default_triple -; UNSUPPORTED: asan, ubsan, msan +;; Building Go bindings with Clang is currently unsupported on AIX. +; UNSUPPORTED: asan, ubsan, msan, -aix diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -0,0 +1,2095 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT + +define i32 @add_v4i32_v4i32(<4 x i32> %x) { +; CHECK-LABEL: add_v4i32_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x) + ret i32 %z +} + +define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) { +; CHECK-LABEL: add_v4i32_v4i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i32> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + ret i64 %z +} + +define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) { +; CHECK-LABEL: add_v4i32_v4i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v1.2d, v0.2s, #0 +; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i32> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + ret i64 %z +} + +define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) { +; CHECK-LABEL: 
add_v2i32_v2i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <2 x i32> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + ret i64 %z +} + +define i64 @add_v2i32_v2i64_sext(<2 x i32> %x) { +; CHECK-LABEL: add_v2i32_v2i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <2 x i32> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + ret i64 %z +} + +define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) { +; CHECK-LABEL: add_v8i16_v8i32_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-NEXT: uaddw2 v0.4s, v1.4s, v0.8h +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i16> %x to <8 x i32> + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + ret i32 %z +} + +define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) { +; CHECK-LABEL: add_v8i16_v8i32_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-NEXT: saddw2 v0.4s, v1.4s, v0.8h +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i16> %x to <8 x i32> + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + ret i32 %z +} + +define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) { +; CHECK-LABEL: add_v4i16_v4i32_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i16> %x to <4 x i32> + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + ret i32 %z +} + +define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) { +; CHECK-LABEL: add_v4i16_v4i32_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: addv s0, v0.4s +; 
CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i16> %x to <4 x i32> + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + ret i32 %z +} + +define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) { +; CHECK-LABEL: add_v8i16_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) + ret i16 %z +} + +define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) { +; CHECK-LABEL: add_v8i16_v8i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uaddl2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i16> %x to <8 x i64> + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + ret i64 %z +} + +define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) { +; CHECK-LABEL: add_v8i16_v8i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: saddl2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i16> %x to <8 x i64> + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + ret i64 %z +} + +define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) { +; CHECK-LABEL: add_v4i16_v4i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i16> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + ret i64 %z +} + +define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) { +; CHECK-LABEL: 
add_v4i16_v4i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v1.2d, v0.2s, #0 +; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i16> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + ret i64 %z +} + +define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) { +; CHECK-LABEL: add_v2i16_v2i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <2 x i16> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + ret i64 %z +} + +define i64 @add_v2i16_v2i64_sext(<2 x i16> %x) { +; CHECK-LABEL: add_v2i16_v2i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: shl v0.2d, v0.2d, #48 +; CHECK-NEXT: sshr v0.2d, v0.2d, #48 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <2 x i16> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + ret i64 %z +} + +define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) { +; CHECK-BASE-LABEL: add_v16i8_v16i32_zext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: ushll2 v1.8h, v0.16b, #0 +; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h +; CHECK-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w0, s0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_v16i8_v16i32_zext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v1.16b, #1 +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: udot v2.4s, v0.16b, v1.16b +; CHECK-DOT-NEXT: addv s0, v2.4s +; CHECK-DOT-NEXT: fmov w0, s0 +; CHECK-DOT-NEXT: ret +entry: + 
%xx = zext <16 x i8> %x to <16 x i32> + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) + ret i32 %z +} + +define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) { +; CHECK-BASE-LABEL: add_v16i8_v16i32_sext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h +; CHECK-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w0, s0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_v16i8_v16i32_sext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v1.16b, #1 +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b +; CHECK-DOT-NEXT: addv s0, v2.4s +; CHECK-DOT-NEXT: fmov w0, s0 +; CHECK-DOT-NEXT: ret +entry: + %xx = sext <16 x i8> %x to <16 x i32> + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) + ret i32 %z +} + +define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) { +; CHECK-BASE-LABEL: add_v8i8_v8i32_zext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w0, s0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_v8i8_v8i32_zext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v1.8b, #1 +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v1.8b +; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s +; CHECK-DOT-NEXT: fmov w0, s0 +; CHECK-DOT-NEXT: ret +entry: + %xx = zext <8 x i8> %x to <8 x i32> + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + ret i32 %z +} + +define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) { +; CHECK-BASE-LABEL: add_v8i8_v8i32_sext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-BASE-NEXT: 
saddw2 v0.4s, v1.4s, v0.8h +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w0, s0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_v8i8_v8i32_sext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v1.8b, #1 +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: sdot v2.2s, v0.8b, v1.8b +; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s +; CHECK-DOT-NEXT: fmov w0, s0 +; CHECK-DOT-NEXT: ret +entry: + %xx = sext <8 x i8> %x to <8 x i32> + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + ret i32 %z +} + +define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) { +; CHECK-LABEL: add_v4i8_v4i32_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i8> %x to <4 x i32> + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + ret i32 %z +} + +define i32 @add_v4i8_v4i32_sext(<4 x i8> %x) { +; CHECK-LABEL: add_v4i8_v4i32_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: shl v0.4s, v0.4s, #24 +; CHECK-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i8> %x to <4 x i32> + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + ret i32 %z +} + +define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) { +; CHECK-LABEL: add_v16i8_v16i16_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v1.8h, v0.8b, #0 +; CHECK-NEXT: uaddw2 v0.8h, v1.8h, v0.16b +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = zext <16 x i8> %x to <16 x i16> + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) + ret i16 %z +} + +define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) { +; CHECK-LABEL: add_v16i8_v16i16_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v1.8h, v0.8b, #0 +; CHECK-NEXT: saddw2 v0.8h, v1.8h, v0.16b +; 
CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: smov w0, v0.h[0] +; CHECK-NEXT: ret +entry: + %xx = sext <16 x i8> %x to <16 x i16> + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) + ret i16 %z +} + +define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) { +; CHECK-LABEL: add_v8i8_v8i16_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i8> %x to <8 x i16> + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) + ret i16 %z +} + +define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) { +; CHECK-LABEL: add_v8i8_v8i16_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: smov w0, v0.h[0] +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i8> %x to <8 x i16> + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) + ret i16 %z +} + +define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) { +; CHECK-LABEL: add_v16i8_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addv b0, v0.16b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) + ret i8 %z +} + +define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) { +; CHECK-LABEL: add_v16i8_v16i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v2.4s +; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v1.4s +; CHECK-NEXT: uaddl v2.2d, v3.2s, v2.2s +; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v1.2d, v5.2d, v4.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <16 x i8> %x to <16 x i64> + %z = call i64 
@llvm.vector.reduce.add.v16i64(<16 x i64> %xx) + ret i64 %z +} + +define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) { +; CHECK-LABEL: add_v16i8_v16i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll2 v2.4s, v1.8h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: saddl2 v4.2d, v3.4s, v2.4s +; CHECK-NEXT: saddl2 v5.2d, v0.4s, v1.4s +; CHECK-NEXT: saddl v2.2d, v3.2s, v2.2s +; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v1.2d, v5.2d, v4.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <16 x i8> %x to <16 x i64> + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) + ret i64 %z +} + +define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) { +; CHECK-LABEL: add_v8i8_v8i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uaddl2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i8> %x to <8 x i64> + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + ret i64 %z +} + +define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) { +; CHECK-LABEL: add_v8i8_v8i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: saddl2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i8> %x to <8 x i64> + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + ret 
i64 %z +} + +define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) { +; CHECK-LABEL: add_v4i8_v4i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i8> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + ret i64 %z +} + +define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) { +; CHECK-LABEL: add_v4i8_v4i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: shl v1.2d, v1.2d, #56 +; CHECK-NEXT: shl v0.2d, v0.2d, #56 +; CHECK-NEXT: sshr v1.2d, v1.2d, #56 +; CHECK-NEXT: ssra v1.2d, v0.2d, #56 +; CHECK-NEXT: addp d0, v1.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i8> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + ret i64 %z +} + +define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) { +; CHECK-LABEL: add_v2i8_v2i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d1, #0x0000ff000000ff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <2 x i8> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + ret i64 %z +} + +define i64 @add_v2i8_v2i64_sext(<2 x i8> %x) { +; CHECK-LABEL: add_v2i8_v2i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: shl v0.2d, v0.2d, #56 +; CHECK-NEXT: sshr v0.2d, v0.2d, #56 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <2 x i8> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + ret i64 %z +} + +define i64 @add_v2i64_v2i64(<2 x i64> %x) { +; CHECK-LABEL: 
add_v2i64_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) + ret i64 %z +} + +define i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) { +; CHECK-LABEL: add_v4i32_v4i32_acc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w0, w8, w0 +; CHECK-NEXT: ret +entry: + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x) + %r = add i32 %z, %a + ret i32 %r +} + +define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) { +; CHECK-LABEL: add_v4i32_v4i64_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i32> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) { +; CHECK-LABEL: add_v4i32_v4i64_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v1.2d, v0.2s, #0 +; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i32> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) { +; CHECK-LABEL: add_v2i32_v2i64_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = zext <2 x i32> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) { +; CHECK-LABEL: 
add_v2i32_v2i64_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = sext <2 x i32> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) { +; CHECK-LABEL: add_v8i16_v8i32_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-NEXT: uaddw2 v0.4s, v1.4s, v0.8h +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w0, w8, w0 +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i16> %x to <8 x i32> + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + %r = add i32 %z, %a + ret i32 %r +} + +define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) { +; CHECK-LABEL: add_v8i16_v8i32_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-NEXT: saddw2 v0.4s, v1.4s, v0.8h +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w0, w8, w0 +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i16> %x to <8 x i32> + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + %r = add i32 %z, %a + ret i32 %r +} + +define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) { +; CHECK-LABEL: add_v4i16_v4i32_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w0, w8, w0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i16> %x to <4 x i32> + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + %r = add i32 %z, %a + ret i32 %r +} + +define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) { +; CHECK-LABEL: add_v4i16_v4i32_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w0, w8, w0 +; CHECK-NEXT: ret +entry: + 
%xx = sext <4 x i16> %x to <4 x i32> + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + %r = add i32 %z, %a + ret i32 %r +} + +define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) { +; CHECK-LABEL: add_v8i16_v8i16_acc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret +entry: + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) + %r = add i16 %z, %a + ret i16 %r +} + +define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) { +; CHECK-LABEL: add_v8i16_v8i64_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uaddl2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i16> %x to <8 x i64> + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) { +; CHECK-LABEL: add_v8i16_v8i64_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: saddl2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i16> %x to <8 x i64> + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) { +; CHECK-LABEL: add_v4i16_v4i64_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, 
d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i16> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) { +; CHECK-LABEL: add_v4i16_v4i64_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v1.2d, v0.2s, #0 +; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i16> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) { +; CHECK-LABEL: add_v2i16_v2i64_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = zext <2 x i16> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) { +; CHECK-LABEL: add_v2i16_v2i64_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: shl v0.2d, v0.2d, #48 +; CHECK-NEXT: sshr v0.2d, v0.2d, #48 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = sext <2 x i16> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) { +; CHECK-BASE-LABEL: add_v16i8_v16i32_acc_zext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: ushll2 v1.8h, v0.16b, #0 +; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h +; 
CHECK-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w8, s0 +; CHECK-BASE-NEXT: add w0, w8, w0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_zext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v1.16b, #1 +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: udot v2.4s, v0.16b, v1.16b +; CHECK-DOT-NEXT: addv s0, v2.4s +; CHECK-DOT-NEXT: fmov w8, s0 +; CHECK-DOT-NEXT: add w0, w8, w0 +; CHECK-DOT-NEXT: ret +entry: + %xx = zext <16 x i8> %x to <16 x i32> + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) + %r = add i32 %z, %a + ret i32 %r +} + +define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) { +; CHECK-BASE-LABEL: add_v16i8_v16i32_acc_sext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h +; CHECK-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w8, s0 +; CHECK-BASE-NEXT: add w0, w8, w0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_sext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v1.16b, #1 +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b +; CHECK-DOT-NEXT: addv s0, v2.4s +; CHECK-DOT-NEXT: fmov w8, s0 +; CHECK-DOT-NEXT: add w0, w8, w0 +; CHECK-DOT-NEXT: ret +entry: + %xx = sext <16 x i8> %x to <16 x i32> + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) + %r = add i32 %z, %a + ret i32 %r +} + +define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) { +; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_zext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h +; CHECK-BASE-NEXT: addv s0, v0.4s +; 
CHECK-BASE-NEXT: fmov w8, s0 +; CHECK-BASE-NEXT: add w0, w8, w0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_v8i8_v8i32_acc_zext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v1.8b, #1 +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v1.8b +; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s +; CHECK-DOT-NEXT: fmov w8, s0 +; CHECK-DOT-NEXT: add w0, w8, w0 +; CHECK-DOT-NEXT: ret +entry: + %xx = zext <8 x i8> %x to <8 x i32> + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + %r = add i32 %z, %a + ret i32 %r +} + +define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) { +; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_sext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w8, s0 +; CHECK-BASE-NEXT: add w0, w8, w0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_v8i8_v8i32_acc_sext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v1.8b, #1 +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: sdot v2.2s, v0.8b, v1.8b +; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s +; CHECK-DOT-NEXT: fmov w8, s0 +; CHECK-DOT-NEXT: add w0, w8, w0 +; CHECK-DOT-NEXT: ret +entry: + %xx = sext <8 x i8> %x to <8 x i32> + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + %r = add i32 %z, %a + ret i32 %r +} + +define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) { +; CHECK-LABEL: add_v4i8_v4i32_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w0, w8, w0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i8> %x to <4 x i32> + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + %r = add i32 %z, %a + ret i32 %r +} + +define i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) { +; 
CHECK-LABEL: add_v4i8_v4i32_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: shl v0.4s, v0.4s, #24 +; CHECK-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w0, w8, w0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i8> %x to <4 x i32> + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + %r = add i32 %z, %a + ret i32 %r +} + +define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) { +; CHECK-LABEL: add_v16i8_v16i16_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v1.8h, v0.8b, #0 +; CHECK-NEXT: uaddw2 v0.8h, v1.8h, v0.16b +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret +entry: + %xx = zext <16 x i8> %x to <16 x i16> + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) + %r = add i16 %z, %a + ret i16 %r +} + +define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) { +; CHECK-LABEL: add_v16i8_v16i16_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v1.8h, v0.8b, #0 +; CHECK-NEXT: saddw2 v0.8h, v1.8h, v0.16b +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: sxth w0, w8 +; CHECK-NEXT: ret +entry: + %xx = sext <16 x i8> %x to <16 x i16> + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) + %r = add i16 %z, %a + ret i16 %r +} + +define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) { +; CHECK-LABEL: add_v8i8_v8i16_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i8> %x to <8 x i16> + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) + %r = add i16 %z, %a + ret i16 %r +} + +define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) { +; 
CHECK-LABEL: add_v8i8_v8i16_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: sxth w0, w8 +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i8> %x to <8 x i16> + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) + %r = add i16 %z, %a + ret i16 %r +} + +define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) { +; CHECK-LABEL: add_v16i8_v16i8_acc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addv b0, v0.16b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: ret +entry: + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) + %r = add i8 %z, %a + ret i8 %r +} + +define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { +; CHECK-LABEL: add_v16i8_v16i64_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v2.4s +; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v1.4s +; CHECK-NEXT: uaddl v2.2d, v3.2s, v2.2s +; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v1.2d, v5.2d, v4.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = zext <16 x i8> %x to <16 x i64> + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) { +; CHECK-LABEL: add_v16i8_v16i64_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll2 v2.4s, v1.8h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0 
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: saddl2 v4.2d, v3.4s, v2.4s +; CHECK-NEXT: saddl2 v5.2d, v0.4s, v1.4s +; CHECK-NEXT: saddl v2.2d, v3.2s, v2.2s +; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v1.2d, v5.2d, v4.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = sext <16 x i8> %x to <16 x i64> + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) { +; CHECK-LABEL: add_v8i8_v8i64_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uaddl2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i8> %x to <8 x i64> + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) { +; CHECK-LABEL: add_v8i8_v8i64_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: saddl2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i8> %x to <8 x i64> + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) { +; CHECK-LABEL: add_v4i8_v4i64_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; 
CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i8> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) { +; CHECK-LABEL: add_v4i8_v4i64_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: shl v1.2d, v1.2d, #56 +; CHECK-NEXT: shl v0.2d, v0.2d, #56 +; CHECK-NEXT: sshr v1.2d, v1.2d, #56 +; CHECK-NEXT: ssra v1.2d, v0.2d, #56 +; CHECK-NEXT: addp d0, v1.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i8> %x to <4 x i64> + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) { +; CHECK-LABEL: add_v2i8_v2i64_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d1, #0x0000ff000000ff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = zext <2 x i8> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %r = add i64 %z, %a + ret i64 %r +} + +define i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) { +; CHECK-LABEL: add_v2i8_v2i64_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: shl v0.2d, v0.2d, #56 +; CHECK-NEXT: sshr v0.2d, v0.2d, #56 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %xx = sext <2 x i8> %x to <2 x i64> + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %r = add i64 %z, %a 
+ ret i64 %r +} + +define i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) { +; CHECK-LABEL: add_v2i64_v2i64_acc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: add x0, x8, x0 +; CHECK-NEXT: ret +entry: + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) + %r = add i64 %z, %a + ret i64 %r +} + +define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: add_pair_v4i32_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x) + %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y) + %z = add i32 %z1, %z2 + ret i32 %z +} + +define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: add_pair_v4i32_v4i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v2.2d, v0.2s, #0 +; CHECK-NEXT: ushll v3.2d, v1.2s, #0 +; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s +; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i32> %x to <4 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %yy = zext <4 x i32> %y to <4 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: add_pair_v4i32_v4i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v2.2d, v0.2s, #0 +; CHECK-NEXT: sshll v3.2d, v1.2s, #0 +; CHECK-NEXT: saddw2 v0.2d, v2.2d, v0.4s +; CHECK-NEXT: saddw2 v1.2d, v3.2d, v1.4s +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i32> %x to <4 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %yy = sext <4 x i32> %y to 
<4 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: add_pair_v2i32_v2i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <2 x i32> %x to <2 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %yy = zext <2 x i32> %y to <2 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: add_pair_v2i32_v2i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <2 x i32> %x to <2 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %yy = sext <2 x i32> %y to <2 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: add_pair_v8i16_v8i32_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v2.4s, v0.4h, #0 +; CHECK-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-NEXT: uaddw2 v0.4s, v2.4s, v0.8h +; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i16> %x to <8 x i32> + %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + %yy = zext <8 x i16> %y to <8 x i32> + %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy) + %z = add i32 %z1, %z2 + ret i32 %z +} + +define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: add_pair_v8i16_v8i32_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v2.4s, v0.4h, #0 +; CHECK-NEXT: 
sshll v3.4s, v1.4h, #0 +; CHECK-NEXT: saddw2 v0.4s, v2.4s, v0.8h +; CHECK-NEXT: saddw2 v1.4s, v3.4s, v1.8h +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i16> %x to <8 x i32> + %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + %yy = sext <8 x i16> %y to <8 x i32> + %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy) + %z = add i32 %z1, %z2 + ret i32 %z +} + +define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) { +; CHECK-LABEL: add_pair_v4i16_v4i32_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i16> %x to <4 x i32> + %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + %yy = zext <4 x i16> %y to <4 x i32> + %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy) + %z = add i32 %z1, %z2 + ret i32 %z +} + +define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) { +; CHECK-LABEL: add_pair_v4i16_v4i32_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i16> %x to <4 x i32> + %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + %yy = sext <4 x i16> %y to <4 x i32> + %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy) + %z = add i32 %z1, %z2 + ret i32 %z +} + +define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: add_pair_v8i16_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret +entry: + %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) + %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y) + %z = add i16 %z1, %z2 + ret i16 
%z +} + +define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: add_pair_v8i16_v8i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: uaddl2 v4.2d, v0.4s, v2.4s +; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uaddl2 v2.2d, v1.4s, v3.4s +; CHECK-NEXT: uaddl v1.2d, v1.2s, v3.2s +; CHECK-NEXT: add v0.2d, v0.2d, v4.2d +; CHECK-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i16> %x to <8 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + %yy = zext <8 x i16> %y to <8 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: add_pair_v8i16_v8i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: saddl2 v4.2d, v0.4s, v2.4s +; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: saddl2 v2.2d, v1.4s, v3.4s +; CHECK-NEXT: saddl v1.2d, v1.2s, v3.2s +; CHECK-NEXT: add v0.2d, v0.2d, v4.2d +; CHECK-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i16> %x to <8 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + %yy = sext <8 x i16> %y to <8 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) { +; CHECK-LABEL: add_pair_v4i16_v4i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, 
#0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v2.2d, v0.2s, #0 +; CHECK-NEXT: ushll v3.2d, v1.2s, #0 +; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s +; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i16> %x to <4 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %yy = zext <4 x i16> %y to <4 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) { +; CHECK-LABEL: add_pair_v4i16_v4i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: sshll v2.2d, v0.2s, #0 +; CHECK-NEXT: sshll v3.2d, v1.2s, #0 +; CHECK-NEXT: saddw2 v0.2d, v2.2d, v0.4s +; CHECK-NEXT: saddw2 v1.2d, v3.2d, v1.4s +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i16> %x to <4 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %yy = sext <4 x i16> %y to <4 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) { +; CHECK-LABEL: add_pair_v2i16_v2i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d2, #0x00ffff0000ffff +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <2 x i16> %x to <2 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %yy = zext <2 x i16> %y to <2 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 
x i16> %y) { +; CHECK-LABEL: add_pair_v2i16_v2i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: shl v0.2d, v0.2d, #48 +; CHECK-NEXT: shl v1.2d, v1.2d, #48 +; CHECK-NEXT: sshr v0.2d, v0.2d, #48 +; CHECK-NEXT: ssra v0.2d, v1.2d, #48 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <2 x i16> %x to <2 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %yy = sext <2 x i16> %y to <2 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) { +; CHECK-BASE-LABEL: add_pair_v16i8_v16i32_zext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: ushll2 v2.8h, v0.16b, #0 +; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-BASE-NEXT: uaddl2 v4.4s, v0.8h, v2.8h +; CHECK-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h +; CHECK-BASE-NEXT: uaddl2 v2.4s, v1.8h, v3.8h +; CHECK-BASE-NEXT: uaddl v1.4s, v1.4h, v3.4h +; CHECK-BASE-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-BASE-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w0, s0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_pair_v16i8_v16i32_zext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v2.16b, #1 +; CHECK-DOT-NEXT: movi v3.2d, #0000000000000000 +; CHECK-DOT-NEXT: udot v3.4s, v1.16b, v2.16b +; CHECK-DOT-NEXT: udot v3.4s, v0.16b, v2.16b +; CHECK-DOT-NEXT: addv s0, v3.4s +; CHECK-DOT-NEXT: fmov w0, s0 +; CHECK-DOT-NEXT: ret +entry: + %xx = zext <16 x i8> %x to <16 x i32> + %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) + %yy = zext <16 x i8> %y to <16 x i32> + %z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy) + %z = add i32 %z1, %z2 + ret i32 %z +} + +define i32 
@add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) { +; CHECK-BASE-LABEL: add_pair_v16i8_v16i32_sext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: sshll2 v2.8h, v0.16b, #0 +; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: sshll2 v3.8h, v1.16b, #0 +; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-BASE-NEXT: saddl2 v4.4s, v0.8h, v2.8h +; CHECK-BASE-NEXT: saddl v0.4s, v0.4h, v2.4h +; CHECK-BASE-NEXT: saddl2 v2.4s, v1.8h, v3.8h +; CHECK-BASE-NEXT: saddl v1.4s, v1.4h, v3.4h +; CHECK-BASE-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-BASE-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w0, s0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_pair_v16i8_v16i32_sext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v2.16b, #1 +; CHECK-DOT-NEXT: movi v3.2d, #0000000000000000 +; CHECK-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b +; CHECK-DOT-NEXT: sdot v3.4s, v0.16b, v2.16b +; CHECK-DOT-NEXT: addv s0, v3.4s +; CHECK-DOT-NEXT: fmov w0, s0 +; CHECK-DOT-NEXT: ret +entry: + %xx = sext <16 x i8> %x to <16 x i32> + %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) + %yy = sext <16 x i8> %y to <16 x i32> + %z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy) + %z = add i32 %z1, %z2 + ret i32 %z +} + +define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) { +; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_zext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-BASE-NEXT: ushll v2.4s, v0.4h, #0 +; CHECK-BASE-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-BASE-NEXT: uaddw2 v0.4s, v2.4s, v0.8h +; CHECK-BASE-NEXT: uaddw2 v1.4s, v3.4s, v1.8h +; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w0, s0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_zext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v2.8b, #1 
+; CHECK-DOT-NEXT: movi v3.2d, #0000000000000000 +; CHECK-DOT-NEXT: udot v3.2s, v1.8b, v2.8b +; CHECK-DOT-NEXT: udot v3.2s, v0.8b, v2.8b +; CHECK-DOT-NEXT: addp v0.2s, v3.2s, v3.2s +; CHECK-DOT-NEXT: fmov w0, s0 +; CHECK-DOT-NEXT: ret +entry: + %xx = zext <8 x i8> %x to <8 x i32> + %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + %yy = zext <8 x i8> %y to <8 x i32> + %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy) + %z = add i32 %z1, %z2 + ret i32 %z +} + +define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) { +; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_sext: +; CHECK-BASE: // %bb.0: // %entry +; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-BASE-NEXT: sshll v2.4s, v0.4h, #0 +; CHECK-BASE-NEXT: sshll v3.4s, v1.4h, #0 +; CHECK-BASE-NEXT: saddw2 v0.4s, v2.4s, v0.8h +; CHECK-BASE-NEXT: saddw2 v1.4s, v3.4s, v1.8h +; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: fmov w0, s0 +; CHECK-BASE-NEXT: ret +; +; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_sext: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v2.8b, #1 +; CHECK-DOT-NEXT: movi v3.2d, #0000000000000000 +; CHECK-DOT-NEXT: sdot v3.2s, v1.8b, v2.8b +; CHECK-DOT-NEXT: sdot v3.2s, v0.8b, v2.8b +; CHECK-DOT-NEXT: addp v0.2s, v3.2s, v3.2s +; CHECK-DOT-NEXT: fmov w0, s0 +; CHECK-DOT-NEXT: ret +entry: + %xx = sext <8 x i8> %x to <8 x i32> + %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) + %yy = sext <8 x i8> %y to <8 x i32> + %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy) + %z = add i32 %z1, %z2 + ret i32 %z +} + +define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) { +; CHECK-LABEL: add_pair_v4i8_v4i32_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: bic v1.4h, #255, lsl #8 +; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i8> 
%x to <4 x i32> + %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + %yy = zext <4 x i8> %y to <4 x i32> + %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy) + %z = add i32 %z1, %z2 + ret i32 %z +} + +define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) { +; CHECK-LABEL: add_pair_v4i8_v4i32_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: shl v0.4s, v0.4s, #24 +; CHECK-NEXT: shl v1.4s, v1.4s, #24 +; CHECK-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-NEXT: ssra v0.4s, v1.4s, #24 +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i8> %x to <4 x i32> + %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) + %yy = sext <4 x i8> %y to <4 x i32> + %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy) + %z = add i32 %z1, %z2 + ret i32 %z +} + +define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: add_pair_v16i8_v16i16_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-NEXT: uaddw2 v0.8h, v2.8h, v0.16b +; CHECK-NEXT: uaddw2 v1.8h, v3.8h, v1.16b +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret +entry: + %xx = zext <16 x i8> %x to <16 x i16> + %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) + %yy = zext <16 x i8> %y to <16 x i16> + %z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy) + %z = add i16 %z1, %z2 + ret i16 %z +} + +define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: add_pair_v16i8_v16i16_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-NEXT: saddw2 v0.8h, v2.8h, v0.16b +; CHECK-NEXT: saddw2 v1.8h, v3.8h, v1.16b +; 
CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: sxth w0, w8 +; CHECK-NEXT: ret +entry: + %xx = sext <16 x i8> %x to <16 x i16> + %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) + %yy = sext <16 x i8> %y to <16 x i16> + %z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy) + %z = add i16 %z1, %z2 + ret i16 %z +} + +define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) { +; CHECK-LABEL: add_pair_v8i8_v8i16_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i8> %x to <8 x i16> + %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) + %yy = zext <8 x i8> %y to <8 x i16> + %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy) + %z = add i16 %z1, %z2 + ret i16 %z +} + +define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) { +; CHECK-LABEL: add_pair_v8i8_v8i16_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: sxth w0, w8 +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i8> %x to <8 x i16> + %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) + %yy = sext <8 x i8> %y to <8 x i16> + %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy) + %z = add i16 %z1, %z2 + ret i16 %z +} + +define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: add_pair_v16i8_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: addv b0, v0.16b +; CHECK-NEXT: addv b1, v1.16b +; CHECK-NEXT: fmov 
w8, s0 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: ret +entry: + %z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) + %z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y) + %z = add i8 %z1, %z2 + ret i8 %z +} + +define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: add_pair_v16i8_v16i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v4.4s, v2.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ushll2 v5.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v6.4s, v3.4h, #0 +; CHECK-NEXT: ushll v7.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-NEXT: uaddl2 v16.2d, v5.4s, v4.4s +; CHECK-NEXT: uaddl v4.2d, v5.2s, v4.2s +; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v2.4s +; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uaddl2 v2.2d, v1.4s, v3.4s +; CHECK-NEXT: uaddl v1.2d, v1.2s, v3.2s +; CHECK-NEXT: uaddl2 v3.2d, v7.4s, v6.4s +; CHECK-NEXT: uaddl v6.2d, v7.2s, v6.2s +; CHECK-NEXT: add v5.2d, v5.2d, v16.2d +; CHECK-NEXT: add v0.2d, v0.2d, v4.2d +; CHECK-NEXT: add v2.2d, v3.2d, v2.2d +; CHECK-NEXT: add v1.2d, v6.2d, v1.2d +; CHECK-NEXT: add v0.2d, v0.2d, v5.2d +; CHECK-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <16 x i8> %x to <16 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) + %yy = zext <16 x i8> %y to <16 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: add_pair_v16i8_v16i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll2 
v2.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: sshll2 v4.4s, v2.8h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: sshll2 v5.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v6.4s, v3.4h, #0 +; CHECK-NEXT: sshll v7.4s, v1.4h, #0 +; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 +; CHECK-NEXT: saddl2 v16.2d, v5.4s, v4.4s +; CHECK-NEXT: saddl v4.2d, v5.2s, v4.2s +; CHECK-NEXT: saddl2 v5.2d, v0.4s, v2.4s +; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: saddl2 v2.2d, v1.4s, v3.4s +; CHECK-NEXT: saddl v1.2d, v1.2s, v3.2s +; CHECK-NEXT: saddl2 v3.2d, v7.4s, v6.4s +; CHECK-NEXT: saddl v6.2d, v7.2s, v6.2s +; CHECK-NEXT: add v5.2d, v5.2d, v16.2d +; CHECK-NEXT: add v0.2d, v0.2d, v4.2d +; CHECK-NEXT: add v2.2d, v3.2d, v2.2d +; CHECK-NEXT: add v1.2d, v6.2d, v1.2d +; CHECK-NEXT: add v0.2d, v0.2d, v5.2d +; CHECK-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <16 x i8> %x to <16 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) + %yy = sext <16 x i8> %y to <16 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) { +; CHECK-LABEL: add_pair_v8i8_v8i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: uaddl2 v4.2d, v0.4s, v2.4s +; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uaddl2 v2.2d, v1.4s, v3.4s +; CHECK-NEXT: uaddl v1.2d, v1.2s, v3.2s +; CHECK-NEXT: add v0.2d, v0.2d, v4.2d +; CHECK-NEXT: add v1.2d, v1.2d, v2.2d +; 
CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <8 x i8> %x to <8 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + %yy = zext <8 x i8> %y to <8 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) { +; CHECK-LABEL: add_pair_v8i8_v8i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: sshll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: saddl2 v4.2d, v0.4s, v2.4s +; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: saddl2 v2.2d, v1.4s, v3.4s +; CHECK-NEXT: saddl v1.2d, v1.2s, v3.2s +; CHECK-NEXT: add v0.2d, v0.2d, v4.2d +; CHECK-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <8 x i8> %x to <8 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) + %yy = sext <8 x i8> %y to <8 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) { +; CHECK-LABEL: add_pair_v4i8_v4i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: bic v1.4h, #255, lsl #8 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v2.2d, v0.2s, #0 +; CHECK-NEXT: ushll v3.2d, v1.2s, #0 +; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s +; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <4 x i8> %x to <4 x i64> + %z1 = call i64 
@llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %yy = zext <4 x i8> %y to <4 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) { +; CHECK-LABEL: add_pair_v4i8_v4i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v2.2d, v0.2s, #0 +; CHECK-NEXT: ushll v3.2d, v1.2s, #0 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 +; CHECK-NEXT: shl v2.2d, v2.2d, #56 +; CHECK-NEXT: shl v3.2d, v3.2d, #56 +; CHECK-NEXT: shl v0.2d, v0.2d, #56 +; CHECK-NEXT: shl v1.2d, v1.2d, #56 +; CHECK-NEXT: sshr v2.2d, v2.2d, #56 +; CHECK-NEXT: sshr v3.2d, v3.2d, #56 +; CHECK-NEXT: ssra v2.2d, v0.2d, #56 +; CHECK-NEXT: ssra v3.2d, v1.2d, #56 +; CHECK-NEXT: add v0.2d, v2.2d, v3.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <4 x i8> %x to <4 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) + %yy = sext <4 x i8> %y to <4 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: add_pair_v2i8_v2i64_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d2, #0x0000ff000000ff +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = zext <2 x i8> %x to <2 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %yy = zext <2 x i8> %y to <2 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: add_pair_v2i8_v2i64_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: 
ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: shl v0.2d, v0.2d, #56 +; CHECK-NEXT: shl v1.2d, v1.2d, #56 +; CHECK-NEXT: sshr v0.2d, v0.2d, #56 +; CHECK-NEXT: ssra v0.2d, v1.2d, #56 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %xx = sext <2 x i8> %x to <2 x i64> + %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) + %yy = sext <2 x i8> %y to <2 x i64> + %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) + %z = add i64 %z1, %z2 + ret i64 %z +} + +define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) { +; CHECK-LABEL: add_pair_v2i64_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) + %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y) + %z = add i64 %z1, %z2 + ret i64 %z +} + +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s 
--check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-V +; RUN: llc -mtriple=riscv32 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZVE64X +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-V +; RUN: llc -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZVE64X define @vdiv_vv_nxv1i8( %va, %vb) { ; CHECK-LABEL: vdiv_vv_nxv1i8: @@ -895,38 +897,45 @@ } define @vdiv_vi_nxv1i64_0( %va) { -; RV32-LABEL: vdiv_vi_nxv1i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 748983 -; RV32-NEXT: addi a0, a0, -586 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: lui a0, 898779 -; RV32-NEXT: addi a0, a0, 1755 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vmulh.vv v8, v8, v9 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vsra.vi v8, v8, 1 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vdiv_vi_nxv1i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 748983 +; RV32-V-NEXT: addi a0, a0, -586 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: lui a0, 898779 +; RV32-V-NEXT: addi a0, a0, 1755 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v9, (a0), zero +; RV32-V-NEXT: vmulh.vv v8, v8, v9 +; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vsrl.vx v9, v8, a0 +; RV32-V-NEXT: vsra.vi v8, v8, 1 +; RV32-V-NEXT: vadd.vv v8, v8, v9 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vdiv_vi_nxv1i64_0: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI58_0) -; 
RV64-NEXT: ld a0, %lo(.LCPI58_0)(a0) -; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV64-NEXT: vmulh.vx v8, v8, a0 -; RV64-NEXT: li a0, 63 -; RV64-NEXT: vsrl.vx v9, v8, a0 -; RV64-NEXT: vsra.vi v8, v8, 1 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: ret +; ZVE64X-LABEL: vdiv_vi_nxv1i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; ZVE64X-NEXT: vdiv.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vdiv_vi_nxv1i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: lui a0, %hi(.LCPI58_0) +; RV64-V-NEXT: ld a0, %lo(.LCPI58_0)(a0) +; RV64-V-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; RV64-V-NEXT: vmulh.vx v8, v8, a0 +; RV64-V-NEXT: li a0, 63 +; RV64-V-NEXT: vsrl.vx v9, v8, a0 +; RV64-V-NEXT: vsra.vi v8, v8, 1 +; RV64-V-NEXT: vadd.vv v8, v8, v9 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = sdiv %va, %splat @@ -969,38 +978,45 @@ } define @vdiv_vi_nxv2i64_0( %va) { -; RV32-LABEL: vdiv_vi_nxv2i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 748983 -; RV32-NEXT: addi a0, a0, -586 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: lui a0, 898779 -; RV32-NEXT: addi a0, a0, 1755 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vmulh.vv v8, v8, v10 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vsrl.vx v10, v8, a0 -; RV32-NEXT: vsra.vi v8, v8, 1 -; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vdiv_vi_nxv2i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 748983 +; RV32-V-NEXT: addi a0, a0, -586 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: lui a0, 898779 +; RV32-V-NEXT: addi a0, a0, 1755 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; 
RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v10, (a0), zero +; RV32-V-NEXT: vmulh.vv v8, v8, v10 +; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vsrl.vx v10, v8, a0 +; RV32-V-NEXT: vsra.vi v8, v8, 1 +; RV32-V-NEXT: vadd.vv v8, v8, v10 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vdiv_vi_nxv2i64_0: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI61_0) -; RV64-NEXT: ld a0, %lo(.LCPI61_0)(a0) -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, mu -; RV64-NEXT: vmulh.vx v8, v8, a0 -; RV64-NEXT: li a0, 63 -; RV64-NEXT: vsrl.vx v10, v8, a0 -; RV64-NEXT: vsra.vi v8, v8, 1 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: ret +; ZVE64X-LABEL: vdiv_vi_nxv2i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; ZVE64X-NEXT: vdiv.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vdiv_vi_nxv2i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: lui a0, %hi(.LCPI61_0) +; RV64-V-NEXT: ld a0, %lo(.LCPI61_0)(a0) +; RV64-V-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; RV64-V-NEXT: vmulh.vx v8, v8, a0 +; RV64-V-NEXT: li a0, 63 +; RV64-V-NEXT: vsrl.vx v10, v8, a0 +; RV64-V-NEXT: vsra.vi v8, v8, 1 +; RV64-V-NEXT: vadd.vv v8, v8, v10 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = sdiv %va, %splat @@ -1043,38 +1059,45 @@ } define @vdiv_vi_nxv4i64_0( %va) { -; RV32-LABEL: vdiv_vi_nxv4i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 748983 -; RV32-NEXT: addi a0, a0, -586 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: lui a0, 898779 -; RV32-NEXT: addi a0, a0, 1755 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmulh.vv v8, v8, v12 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vsrl.vx v12, v8, a0 -; RV32-NEXT: vsra.vi v8, v8, 1 -; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: addi sp, sp, 16 -; 
RV32-NEXT: ret +; RV32-V-LABEL: vdiv_vi_nxv4i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 748983 +; RV32-V-NEXT: addi a0, a0, -586 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: lui a0, 898779 +; RV32-V-NEXT: addi a0, a0, 1755 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v12, (a0), zero +; RV32-V-NEXT: vmulh.vv v8, v8, v12 +; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vsrl.vx v12, v8, a0 +; RV32-V-NEXT: vsra.vi v8, v8, 1 +; RV32-V-NEXT: vadd.vv v8, v8, v12 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vdiv_vi_nxv4i64_0: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI64_0) -; RV64-NEXT: ld a0, %lo(.LCPI64_0)(a0) -; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, mu -; RV64-NEXT: vmulh.vx v8, v8, a0 -; RV64-NEXT: li a0, 63 -; RV64-NEXT: vsrl.vx v12, v8, a0 -; RV64-NEXT: vsra.vi v8, v8, 1 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: ret +; ZVE64X-LABEL: vdiv_vi_nxv4i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m4, ta, mu +; ZVE64X-NEXT: vdiv.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vdiv_vi_nxv4i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: lui a0, %hi(.LCPI64_0) +; RV64-V-NEXT: ld a0, %lo(.LCPI64_0)(a0) +; RV64-V-NEXT: vsetvli a1, zero, e64, m4, ta, mu +; RV64-V-NEXT: vmulh.vx v8, v8, a0 +; RV64-V-NEXT: li a0, 63 +; RV64-V-NEXT: vsrl.vx v12, v8, a0 +; RV64-V-NEXT: vsra.vi v8, v8, 1 +; RV64-V-NEXT: vadd.vv v8, v8, v12 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = sdiv %va, %splat @@ -1117,41 +1140,47 @@ } define @vdiv_vi_nxv8i64_0( %va) { -; RV32-LABEL: vdiv_vi_nxv8i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 748983 -; RV32-NEXT: addi a0, a0, -586 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: 
lui a0, 898779 -; RV32-NEXT: addi a0, a0, 1755 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmulh.vv v8, v8, v16 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vsrl.vx v16, v8, a0 -; RV32-NEXT: vsra.vi v8, v8, 1 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vdiv_vi_nxv8i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 748983 +; RV32-V-NEXT: addi a0, a0, -586 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: lui a0, 898779 +; RV32-V-NEXT: addi a0, a0, 1755 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v16, (a0), zero +; RV32-V-NEXT: vmulh.vv v8, v8, v16 +; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vsrl.vx v16, v8, a0 +; RV32-V-NEXT: vsra.vi v8, v8, 1 +; RV32-V-NEXT: vadd.vv v8, v8, v16 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vdiv_vi_nxv8i64_0: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI67_0) -; RV64-NEXT: ld a0, %lo(.LCPI67_0)(a0) -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vmulh.vx v8, v8, a0 -; RV64-NEXT: li a0, 63 -; RV64-NEXT: vsrl.vx v16, v8, a0 -; RV64-NEXT: vsra.vi v8, v8, 1 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: ret +; ZVE64X-LABEL: vdiv_vi_nxv8i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; ZVE64X-NEXT: vdiv.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vdiv_vi_nxv8i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: lui a0, %hi(.LCPI67_0) +; RV64-V-NEXT: ld a0, %lo(.LCPI67_0)(a0) +; RV64-V-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-V-NEXT: vmulh.vx v8, v8, a0 +; RV64-V-NEXT: li a0, 63 +; RV64-V-NEXT: vsrl.vx v16, v8, a0 +; RV64-V-NEXT: vsra.vi v8, v8, 1 +; RV64-V-NEXT: vadd.vv v8, v8, v16 +; RV64-V-NEXT: ret %head = insertelement 
undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = sdiv %va, %splat ret %vc } - diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-V +; RUN: llc -mtriple=riscv32 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZVE64X +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-V +; RUN: llc -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZVE64X define @vdivu_vv_nxv1i8( %va, %vb) { ; CHECK-LABEL: vdivu_vv_nxv1i8: @@ -820,33 +822,40 @@ } define @vdivu_vi_nxv1i64_0( %va) { -; RV32-LABEL: vdivu_vi_nxv1i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 131072 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vmulhu.vv v8, v8, v9 -; RV32-NEXT: li a0, 61 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vdivu_vi_nxv1i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 131072 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: li a0, 1 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m1, ta, 
mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v9, (a0), zero +; RV32-V-NEXT: vmulhu.vv v8, v8, v9 +; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vsrl.vx v8, v8, a0 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vdivu_vi_nxv1i64_0: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: slli a0, a0, 61 -; RV64-NEXT: addi a0, a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: li a0, 61 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; ZVE64X-LABEL: vdivu_vi_nxv1i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; ZVE64X-NEXT: vdivu.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vdivu_vi_nxv1i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: li a0, 1 +; RV64-V-NEXT: slli a0, a0, 61 +; RV64-V-NEXT: addi a0, a0, 1 +; RV64-V-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; RV64-V-NEXT: vmulhu.vx v8, v8, a0 +; RV64-V-NEXT: li a0, 61 +; RV64-V-NEXT: vsrl.vx v8, v8, a0 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = udiv %va, %splat @@ -916,33 +925,40 @@ } define @vdivu_vi_nxv2i64_0( %va) { -; RV32-LABEL: vdivu_vi_nxv2i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 131072 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vmulhu.vv v8, v8, v10 -; RV32-NEXT: li a0, 61 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vdivu_vi_nxv2i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 131072 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: li a0, 1 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV32-V-NEXT: 
addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v10, (a0), zero +; RV32-V-NEXT: vmulhu.vv v8, v8, v10 +; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vsrl.vx v8, v8, a0 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vdivu_vi_nxv2i64_0: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: slli a0, a0, 61 -; RV64-NEXT: addi a0, a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, mu -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: li a0, 61 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; ZVE64X-LABEL: vdivu_vi_nxv2i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; ZVE64X-NEXT: vdivu.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vdivu_vi_nxv2i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: li a0, 1 +; RV64-V-NEXT: slli a0, a0, 61 +; RV64-V-NEXT: addi a0, a0, 1 +; RV64-V-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; RV64-V-NEXT: vmulhu.vx v8, v8, a0 +; RV64-V-NEXT: li a0, 61 +; RV64-V-NEXT: vsrl.vx v8, v8, a0 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = udiv %va, %splat @@ -1012,33 +1028,40 @@ } define @vdivu_vi_nxv4i64_0( %va) { -; RV32-LABEL: vdivu_vi_nxv4i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 131072 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmulhu.vv v8, v8, v12 -; RV32-NEXT: li a0, 61 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vdivu_vi_nxv4i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 131072 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: li a0, 1 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; 
RV32-V-NEXT: vlse64.v v12, (a0), zero +; RV32-V-NEXT: vmulhu.vv v8, v8, v12 +; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vsrl.vx v8, v8, a0 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vdivu_vi_nxv4i64_0: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: slli a0, a0, 61 -; RV64-NEXT: addi a0, a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, mu -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: li a0, 61 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; ZVE64X-LABEL: vdivu_vi_nxv4i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m4, ta, mu +; ZVE64X-NEXT: vdivu.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vdivu_vi_nxv4i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: li a0, 1 +; RV64-V-NEXT: slli a0, a0, 61 +; RV64-V-NEXT: addi a0, a0, 1 +; RV64-V-NEXT: vsetvli a1, zero, e64, m4, ta, mu +; RV64-V-NEXT: vmulhu.vx v8, v8, a0 +; RV64-V-NEXT: li a0, 61 +; RV64-V-NEXT: vsrl.vx v8, v8, a0 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = udiv %va, %splat @@ -1108,33 +1131,40 @@ } define @vdivu_vi_nxv8i64_0( %va) { -; RV32-LABEL: vdivu_vi_nxv8i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 131072 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmulhu.vv v8, v8, v16 -; RV32-NEXT: li a0, 61 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vdivu_vi_nxv8i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 131072 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: li a0, 1 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: 
vlse64.v v16, (a0), zero +; RV32-V-NEXT: vmulhu.vv v8, v8, v16 +; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vsrl.vx v8, v8, a0 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vdivu_vi_nxv8i64_0: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: slli a0, a0, 61 -; RV64-NEXT: addi a0, a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: li a0, 61 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; ZVE64X-LABEL: vdivu_vi_nxv8i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; ZVE64X-NEXT: vdivu.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vdivu_vi_nxv8i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: li a0, 1 +; RV64-V-NEXT: slli a0, a0, 61 +; RV64-V-NEXT: addi a0, a0, 1 +; RV64-V-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-V-NEXT: vmulhu.vx v8, v8, a0 +; RV64-V-NEXT: li a0, 61 +; RV64-V-NEXT: vsrl.vx v8, v8, a0 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = udiv %va, %splat diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-V +; RUN: llc -mtriple=riscv32 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZVE64X +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-V +; RUN: llc -mtriple=riscv64 
-mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZVE64X define @vrem_vv_nxv1i8( %va, %vb) { ; CHECK-LABEL: vrem_vv_nxv1i8: @@ -929,42 +931,49 @@ } define @vrem_vi_nxv1i64_0( %va) { -; RV32-LABEL: vrem_vi_nxv1i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 748983 -; RV32-NEXT: addi a0, a0, -586 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: lui a0, 898779 -; RV32-NEXT: addi a0, a0, 1755 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vmulh.vv v9, v8, v9 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vsrl.vx v10, v9, a0 -; RV32-NEXT: vsra.vi v9, v9, 1 -; RV32-NEXT: vadd.vv v9, v9, v10 -; RV32-NEXT: li a0, -7 -; RV32-NEXT: vnmsac.vx v8, a0, v9 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vrem_vi_nxv1i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 748983 +; RV32-V-NEXT: addi a0, a0, -586 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: lui a0, 898779 +; RV32-V-NEXT: addi a0, a0, 1755 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v9, (a0), zero +; RV32-V-NEXT: vmulh.vv v9, v8, v9 +; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vsrl.vx v10, v9, a0 +; RV32-V-NEXT: vsra.vi v9, v9, 1 +; RV32-V-NEXT: vadd.vv v9, v9, v10 +; RV32-V-NEXT: li a0, -7 +; RV32-V-NEXT: vnmsac.vx v8, a0, v9 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vrem_vi_nxv1i64_0: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI56_0) -; RV64-NEXT: ld a0, %lo(.LCPI56_0)(a0) -; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV64-NEXT: vmulh.vx v9, v8, a0 -; RV64-NEXT: li a0, 63 -; RV64-NEXT: vsrl.vx v10, v9, a0 -; RV64-NEXT: vsra.vi v9, v9, 1 -; RV64-NEXT: vadd.vv v9, v9, v10 -; RV64-NEXT: li a0, -7 -; RV64-NEXT: vnmsac.vx 
v8, a0, v9 -; RV64-NEXT: ret +; ZVE64X-LABEL: vrem_vi_nxv1i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; ZVE64X-NEXT: vrem.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vrem_vi_nxv1i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: lui a0, %hi(.LCPI56_0) +; RV64-V-NEXT: ld a0, %lo(.LCPI56_0)(a0) +; RV64-V-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; RV64-V-NEXT: vmulh.vx v9, v8, a0 +; RV64-V-NEXT: li a0, 63 +; RV64-V-NEXT: vsrl.vx v10, v9, a0 +; RV64-V-NEXT: vsra.vi v9, v9, 1 +; RV64-V-NEXT: vadd.vv v9, v9, v10 +; RV64-V-NEXT: li a0, -7 +; RV64-V-NEXT: vnmsac.vx v8, a0, v9 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = srem %va, %splat @@ -1007,42 +1016,49 @@ } define @vrem_vi_nxv2i64_0( %va) { -; RV32-LABEL: vrem_vi_nxv2i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 748983 -; RV32-NEXT: addi a0, a0, -586 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: lui a0, 898779 -; RV32-NEXT: addi a0, a0, 1755 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vmulh.vv v10, v8, v10 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vsrl.vx v12, v10, a0 -; RV32-NEXT: vsra.vi v10, v10, 1 -; RV32-NEXT: vadd.vv v10, v10, v12 -; RV32-NEXT: li a0, -7 -; RV32-NEXT: vnmsac.vx v8, a0, v10 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vrem_vi_nxv2i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 748983 +; RV32-V-NEXT: addi a0, a0, -586 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: lui a0, 898779 +; RV32-V-NEXT: addi a0, a0, 1755 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v10, (a0), zero +; RV32-V-NEXT: vmulh.vv v10, v8, v10 
+; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vsrl.vx v12, v10, a0 +; RV32-V-NEXT: vsra.vi v10, v10, 1 +; RV32-V-NEXT: vadd.vv v10, v10, v12 +; RV32-V-NEXT: li a0, -7 +; RV32-V-NEXT: vnmsac.vx v8, a0, v10 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vrem_vi_nxv2i64_0: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI59_0) -; RV64-NEXT: ld a0, %lo(.LCPI59_0)(a0) -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, mu -; RV64-NEXT: vmulh.vx v10, v8, a0 -; RV64-NEXT: li a0, 63 -; RV64-NEXT: vsrl.vx v12, v10, a0 -; RV64-NEXT: vsra.vi v10, v10, 1 -; RV64-NEXT: vadd.vv v10, v10, v12 -; RV64-NEXT: li a0, -7 -; RV64-NEXT: vnmsac.vx v8, a0, v10 -; RV64-NEXT: ret +; ZVE64X-LABEL: vrem_vi_nxv2i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; ZVE64X-NEXT: vrem.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vrem_vi_nxv2i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: lui a0, %hi(.LCPI59_0) +; RV64-V-NEXT: ld a0, %lo(.LCPI59_0)(a0) +; RV64-V-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; RV64-V-NEXT: vmulh.vx v10, v8, a0 +; RV64-V-NEXT: li a0, 63 +; RV64-V-NEXT: vsrl.vx v12, v10, a0 +; RV64-V-NEXT: vsra.vi v10, v10, 1 +; RV64-V-NEXT: vadd.vv v10, v10, v12 +; RV64-V-NEXT: li a0, -7 +; RV64-V-NEXT: vnmsac.vx v8, a0, v10 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = srem %va, %splat @@ -1085,42 +1101,49 @@ } define @vrem_vi_nxv4i64_0( %va) { -; RV32-LABEL: vrem_vi_nxv4i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 748983 -; RV32-NEXT: addi a0, a0, -586 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: lui a0, 898779 -; RV32-NEXT: addi a0, a0, 1755 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmulh.vv v12, v8, v12 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vsrl.vx v16, v12, a0 -; 
RV32-NEXT: vsra.vi v12, v12, 1 -; RV32-NEXT: vadd.vv v12, v12, v16 -; RV32-NEXT: li a0, -7 -; RV32-NEXT: vnmsac.vx v8, a0, v12 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vrem_vi_nxv4i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 748983 +; RV32-V-NEXT: addi a0, a0, -586 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: lui a0, 898779 +; RV32-V-NEXT: addi a0, a0, 1755 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v12, (a0), zero +; RV32-V-NEXT: vmulh.vv v12, v8, v12 +; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vsrl.vx v16, v12, a0 +; RV32-V-NEXT: vsra.vi v12, v12, 1 +; RV32-V-NEXT: vadd.vv v12, v12, v16 +; RV32-V-NEXT: li a0, -7 +; RV32-V-NEXT: vnmsac.vx v8, a0, v12 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vrem_vi_nxv4i64_0: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI62_0) -; RV64-NEXT: ld a0, %lo(.LCPI62_0)(a0) -; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, mu -; RV64-NEXT: vmulh.vx v12, v8, a0 -; RV64-NEXT: li a0, 63 -; RV64-NEXT: vsrl.vx v16, v12, a0 -; RV64-NEXT: vsra.vi v12, v12, 1 -; RV64-NEXT: vadd.vv v12, v12, v16 -; RV64-NEXT: li a0, -7 -; RV64-NEXT: vnmsac.vx v8, a0, v12 -; RV64-NEXT: ret +; ZVE64X-LABEL: vrem_vi_nxv4i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m4, ta, mu +; ZVE64X-NEXT: vrem.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vrem_vi_nxv4i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: lui a0, %hi(.LCPI62_0) +; RV64-V-NEXT: ld a0, %lo(.LCPI62_0)(a0) +; RV64-V-NEXT: vsetvli a1, zero, e64, m4, ta, mu +; RV64-V-NEXT: vmulh.vx v12, v8, a0 +; RV64-V-NEXT: li a0, 63 +; RV64-V-NEXT: vsrl.vx v16, v12, a0 +; RV64-V-NEXT: vsra.vi v12, v12, 1 +; RV64-V-NEXT: vadd.vv v12, v12, v16 +; RV64-V-NEXT: li a0, -7 +; RV64-V-NEXT: vnmsac.vx v8, a0, v12 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, 
i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = srem %va, %splat @@ -1163,42 +1186,49 @@ } define @vrem_vi_nxv8i64_0( %va) { -; RV32-LABEL: vrem_vi_nxv8i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 748983 -; RV32-NEXT: addi a0, a0, -586 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: lui a0, 898779 -; RV32-NEXT: addi a0, a0, 1755 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmulh.vv v16, v8, v16 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vsra.vi v16, v16, 1 -; RV32-NEXT: vadd.vv v16, v16, v24 -; RV32-NEXT: li a0, -7 -; RV32-NEXT: vnmsac.vx v8, a0, v16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vrem_vi_nxv8i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 748983 +; RV32-V-NEXT: addi a0, a0, -586 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: lui a0, 898779 +; RV32-V-NEXT: addi a0, a0, 1755 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v16, (a0), zero +; RV32-V-NEXT: vmulh.vv v16, v8, v16 +; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vsrl.vx v24, v16, a0 +; RV32-V-NEXT: vsra.vi v16, v16, 1 +; RV32-V-NEXT: vadd.vv v16, v16, v24 +; RV32-V-NEXT: li a0, -7 +; RV32-V-NEXT: vnmsac.vx v8, a0, v16 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vrem_vi_nxv8i64_0: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI65_0) -; RV64-NEXT: ld a0, %lo(.LCPI65_0)(a0) -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vmulh.vx v16, v8, a0 -; RV64-NEXT: li a0, 63 -; RV64-NEXT: vsrl.vx v24, v16, a0 -; RV64-NEXT: vsra.vi v16, v16, 1 -; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: li a0, -7 -; RV64-NEXT: vnmsac.vx v8, a0, v16 -; RV64-NEXT: ret +; ZVE64X-LABEL: 
vrem_vi_nxv8i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; ZVE64X-NEXT: vrem.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vrem_vi_nxv8i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: lui a0, %hi(.LCPI65_0) +; RV64-V-NEXT: ld a0, %lo(.LCPI65_0)(a0) +; RV64-V-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-V-NEXT: vmulh.vx v16, v8, a0 +; RV64-V-NEXT: li a0, 63 +; RV64-V-NEXT: vsrl.vx v24, v16, a0 +; RV64-V-NEXT: vsra.vi v16, v16, 1 +; RV64-V-NEXT: vadd.vv v16, v16, v24 +; RV64-V-NEXT: li a0, -7 +; RV64-V-NEXT: vnmsac.vx v8, a0, v16 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = srem %va, %splat diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-V +; RUN: llc -mtriple=riscv32 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZVE64X +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-V +; RUN: llc -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZVE64X define @vremu_vv_nxv1i8( %va, %vb) { ; CHECK-LABEL: vremu_vv_nxv1i8: @@ -854,37 +856,44 @@ } define @vremu_vi_nxv1i64_0( %va) { -; RV32-LABEL: vremu_vi_nxv1i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 
131072 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vmulhu.vv v9, v8, v9 -; RV32-NEXT: li a0, 61 -; RV32-NEXT: vsrl.vx v9, v9, a0 -; RV32-NEXT: li a0, -7 -; RV32-NEXT: vnmsac.vx v8, a0, v9 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vremu_vi_nxv1i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 131072 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: li a0, 1 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v9, (a0), zero +; RV32-V-NEXT: vmulhu.vv v9, v8, v9 +; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vsrl.vx v9, v9, a0 +; RV32-V-NEXT: li a0, -7 +; RV32-V-NEXT: vnmsac.vx v8, a0, v9 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vremu_vi_nxv1i64_0: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: slli a0, a0, 61 -; RV64-NEXT: addi a0, a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV64-NEXT: vmulhu.vx v9, v8, a0 -; RV64-NEXT: li a0, 61 -; RV64-NEXT: vsrl.vx v9, v9, a0 -; RV64-NEXT: li a0, -7 -; RV64-NEXT: vnmsac.vx v8, a0, v9 -; RV64-NEXT: ret +; ZVE64X-LABEL: vremu_vi_nxv1i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; ZVE64X-NEXT: vremu.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vremu_vi_nxv1i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: li a0, 1 +; RV64-V-NEXT: slli a0, a0, 61 +; RV64-V-NEXT: addi a0, a0, 1 +; RV64-V-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; RV64-V-NEXT: vmulhu.vx v9, v8, a0 +; RV64-V-NEXT: li a0, 61 +; RV64-V-NEXT: vsrl.vx v9, v9, a0 +; RV64-V-NEXT: li a0, -7 +; RV64-V-NEXT: vnmsac.vx v8, a0, v9 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc 
= urem %va, %splat @@ -958,37 +967,44 @@ } define @vremu_vi_nxv2i64_0( %va) { -; RV32-LABEL: vremu_vi_nxv2i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 131072 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vmulhu.vv v10, v8, v10 -; RV32-NEXT: li a0, 61 -; RV32-NEXT: vsrl.vx v10, v10, a0 -; RV32-NEXT: li a0, -7 -; RV32-NEXT: vnmsac.vx v8, a0, v10 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vremu_vi_nxv2i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 131072 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: li a0, 1 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v10, (a0), zero +; RV32-V-NEXT: vmulhu.vv v10, v8, v10 +; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vsrl.vx v10, v10, a0 +; RV32-V-NEXT: li a0, -7 +; RV32-V-NEXT: vnmsac.vx v8, a0, v10 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vremu_vi_nxv2i64_0: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: slli a0, a0, 61 -; RV64-NEXT: addi a0, a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, mu -; RV64-NEXT: vmulhu.vx v10, v8, a0 -; RV64-NEXT: li a0, 61 -; RV64-NEXT: vsrl.vx v10, v10, a0 -; RV64-NEXT: li a0, -7 -; RV64-NEXT: vnmsac.vx v8, a0, v10 -; RV64-NEXT: ret +; ZVE64X-LABEL: vremu_vi_nxv2i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; ZVE64X-NEXT: vremu.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vremu_vi_nxv2i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: li a0, 1 +; RV64-V-NEXT: slli a0, a0, 61 +; RV64-V-NEXT: addi a0, a0, 1 +; RV64-V-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; RV64-V-NEXT: vmulhu.vx v10, v8, a0 +; 
RV64-V-NEXT: li a0, 61 +; RV64-V-NEXT: vsrl.vx v10, v10, a0 +; RV64-V-NEXT: li a0, -7 +; RV64-V-NEXT: vnmsac.vx v8, a0, v10 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = urem %va, %splat @@ -1062,37 +1078,44 @@ } define @vremu_vi_nxv4i64_0( %va) { -; RV32-LABEL: vremu_vi_nxv4i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 131072 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmulhu.vv v12, v8, v12 -; RV32-NEXT: li a0, 61 -; RV32-NEXT: vsrl.vx v12, v12, a0 -; RV32-NEXT: li a0, -7 -; RV32-NEXT: vnmsac.vx v8, a0, v12 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vremu_vi_nxv4i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 131072 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: li a0, 1 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v12, (a0), zero +; RV32-V-NEXT: vmulhu.vv v12, v8, v12 +; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vsrl.vx v12, v12, a0 +; RV32-V-NEXT: li a0, -7 +; RV32-V-NEXT: vnmsac.vx v8, a0, v12 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vremu_vi_nxv4i64_0: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: slli a0, a0, 61 -; RV64-NEXT: addi a0, a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, mu -; RV64-NEXT: vmulhu.vx v12, v8, a0 -; RV64-NEXT: li a0, 61 -; RV64-NEXT: vsrl.vx v12, v12, a0 -; RV64-NEXT: li a0, -7 -; RV64-NEXT: vnmsac.vx v8, a0, v12 -; RV64-NEXT: ret +; ZVE64X-LABEL: vremu_vi_nxv4i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m4, ta, mu +; ZVE64X-NEXT: vremu.vx v8, v8, a0 +; ZVE64X-NEXT: 
ret +; +; RV64-V-LABEL: vremu_vi_nxv4i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: li a0, 1 +; RV64-V-NEXT: slli a0, a0, 61 +; RV64-V-NEXT: addi a0, a0, 1 +; RV64-V-NEXT: vsetvli a1, zero, e64, m4, ta, mu +; RV64-V-NEXT: vmulhu.vx v12, v8, a0 +; RV64-V-NEXT: li a0, 61 +; RV64-V-NEXT: vsrl.vx v12, v12, a0 +; RV64-V-NEXT: li a0, -7 +; RV64-V-NEXT: vnmsac.vx v8, a0, v12 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = urem %va, %splat @@ -1166,37 +1189,44 @@ } define @vremu_vi_nxv8i64_0( %va) { -; RV32-LABEL: vremu_vi_nxv8i64_0: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 131072 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmulhu.vv v16, v8, v16 -; RV32-NEXT: li a0, 61 -; RV32-NEXT: vsrl.vx v16, v16, a0 -; RV32-NEXT: li a0, -7 -; RV32-NEXT: vnmsac.vx v8, a0, v16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-V-LABEL: vremu_vi_nxv8i64_0: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -16 +; RV32-V-NEXT: .cfi_def_cfa_offset 16 +; RV32-V-NEXT: lui a0, 131072 +; RV32-V-NEXT: sw a0, 12(sp) +; RV32-V-NEXT: li a0, 1 +; RV32-V-NEXT: sw a0, 8(sp) +; RV32-V-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV32-V-NEXT: addi a0, sp, 8 +; RV32-V-NEXT: vlse64.v v16, (a0), zero +; RV32-V-NEXT: vmulhu.vv v16, v8, v16 +; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vsrl.vx v16, v16, a0 +; RV32-V-NEXT: li a0, -7 +; RV32-V-NEXT: vnmsac.vx v8, a0, v16 +; RV32-V-NEXT: addi sp, sp, 16 +; RV32-V-NEXT: ret ; -; RV64-LABEL: vremu_vi_nxv8i64_0: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: slli a0, a0, 61 -; RV64-NEXT: addi a0, a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vmulhu.vx v16, v8, a0 -; RV64-NEXT: li a0, 61 -; RV64-NEXT: vsrl.vx v16, v16, a0 -; RV64-NEXT: li a0, 
-7 -; RV64-NEXT: vnmsac.vx v8, a0, v16 -; RV64-NEXT: ret +; ZVE64X-LABEL: vremu_vi_nxv8i64_0: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: li a0, -7 +; ZVE64X-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; ZVE64X-NEXT: vremu.vx v8, v8, a0 +; ZVE64X-NEXT: ret +; +; RV64-V-LABEL: vremu_vi_nxv8i64_0: +; RV64-V: # %bb.0: +; RV64-V-NEXT: li a0, 1 +; RV64-V-NEXT: slli a0, a0, 61 +; RV64-V-NEXT: addi a0, a0, 1 +; RV64-V-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-V-NEXT: vmulhu.vx v16, v8, a0 +; RV64-V-NEXT: li a0, 61 +; RV64-V-NEXT: vsrl.vx v16, v16, a0 +; RV64-V-NEXT: li a0, -7 +; RV64-V-NEXT: vnmsac.vx v8, a0, v16 +; RV64-V-NEXT: ret %head = insertelement undef, i64 -7, i32 0 %splat = shufflevector %head, undef, zeroinitializer %vc = urem %va, %splat diff --git a/llvm/test/Transforms/IROutliner/gvn-output-set-overload.ll b/llvm/test/Transforms/IROutliner/gvn-output-set-overload.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/IROutliner/gvn-output-set-overload.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we do differentiate between outputs of the region stored in PHINodes +; versus those stored outside of PHINodes. 
+ +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +next: + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + ret void +next: + %1 = add i32 %c, 1 + %2 = add i32 %e, 1 + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32* null, i32 0) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[E_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[C_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[E_LOC]] to i8* +; CHECK-NEXT: call void 
@llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[C_LOC]], i32* [[E_LOC]], i32 1) +; CHECK-NEXT: [[C_RELOAD:%.*]] = load i32, i32* [[C_LOC]], align 4 +; CHECK-NEXT: [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[C_RELOAD]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[E_RELOAD]], 1 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal i1 @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT:%.*]], label [[TEST:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: first.exitStub: +; CHECK-NEXT: switch i32 [[TMP3:%.*]], label [[FINAL_BLOCK_1:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_1:%.*]] +; CHECK-NEXT: i32 1, label [[OUTPUT_BLOCK_1_1:%.*]] +; CHECK-NEXT: ] +; CHECK: next.exitStub: +; CHECK-NEXT: switch i32 [[TMP3]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_1_0:%.*]] +; CHECK-NEXT: ] +; CHECK: output_block_0_1: +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: output_block_1_0: +; CHECK-NEXT: store i32 
[[C]], i32* [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[E]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: output_block_1_1: +; CHECK-NEXT: store i32 [[C]], i32* [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[E]], i32* [[TMP2]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: final_block_0: +; CHECK-NEXT: ret i1 false +; CHECK: final_block_1: +; CHECK-NEXT: ret i1 true +; diff --git a/llvm/test/Transforms/IROutliner/illegal-returns-twice.ll b/llvm/test/Transforms/IROutliner/illegal-returns-twice.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/IROutliner/illegal-returns-twice.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; This test checks that we do not outline functions that are marked as returns +; twice, since these can alter the frame of the function and affect how the +; outliner behaves, causing miscompiles. 
+ +; Function Attrs: optsize returns_twice +declare i32 @setjmp(i32*) local_unnamed_addr #1 +@tmp_jmpb = global [37 x i32] zeroinitializer, align 16 + +define void @function1() { +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 2, i32* [[A]], align 4 +; CHECK-NEXT: store i32 3, i32* [[B]], align 4 +; CHECK-NEXT: store i32 4, i32* [[C]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @setjmp(i32* getelementptr inbounds ([37 x i32], [37 x i32]* @tmp_jmpb, i64 0, i64 0)) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %call = call i32 @setjmp(i32* getelementptr inbounds ([37 x i32], [37 x i32]* @tmp_jmpb, i64 0, i64 0)) + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} + +define void @function2() { +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 2, i32* [[A]], align 4 +; CHECK-NEXT: store i32 3, i32* [[B]], align 4 +; CHECK-NEXT: store i32 4, i32* [[C]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @setjmp(i32* getelementptr inbounds ([37 x i32], [37 x i32]* @tmp_jmpb, i64 0, i64 0)) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %call = call i32 @setjmp(i32* getelementptr inbounds ([37 x i32], [37 x i32]* 
@tmp_jmpb, i64 0, i64 0)) + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} + +attributes #1 = { optsize returns_twice } diff --git a/llvm/test/Transforms/IROutliner/mismatched-phi-exits-not-in-first-outlined.ll b/llvm/test/Transforms/IROutliner/mismatched-phi-exits-not-in-first-outlined.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/IROutliner/mismatched-phi-exits-not-in-first-outlined.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we are able to extract blocks that contain PHINodes, and selectively +; store into it's respective block, creating a new block if needed. + +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br label %first +test: + %d = load i32, i32* %0, align 4 + br label %first +first: + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br label %first +test: + %d = load i32, i32* %0, align 4 + br label %first +first: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]], i32* null, i32 -1) +; CHECK-NEXT: br label [[FIRST:%.*]] +; CHECK: first: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void 
@outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[FIRST:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal void @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br label [[PHI_BLOCK:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br label [[PHI_BLOCK]] +; CHECK: first.exitStub: +; CHECK-NEXT: switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_1_0:%.*]] +; CHECK-NEXT: ] +; CHECK: output_block_1_0: +; CHECK-NEXT: store i32 [[TMP3:%.*]], i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: phi_block: +; CHECK-NEXT: [[TMP3]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: final_block_0: +; CHECK-NEXT: ret void +; diff --git a/llvm/test/Transforms/IROutliner/mismatched-phi-exits.ll b/llvm/test/Transforms/IROutliner/mismatched-phi-exits.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/IROutliner/mismatched-phi-exits.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we are able to extract blocks that contain PHINodes, and selectively +; store into it's respective block, only using if needed. 
+ +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br label %first +test: + %d = load i32, i32* %0, align 4 + br label %first +first: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br label %first +test: + %d = load i32, i32* %0, align 4 + br label %first +first: + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[FIRST:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]], i32* null, i32 -1) +; CHECK-NEXT: br label [[FIRST:%.*]] +; CHECK: first: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal void @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br label [[FIRST_SPLIT:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* 
[[TMP0]], align 4 +; CHECK-NEXT: br label [[FIRST_SPLIT]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: first.exitStub: +; CHECK-NEXT: switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_0:%.*]] +; CHECK-NEXT: ] +; CHECK: output_block_0_0: +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: final_block_0: +; CHECK-NEXT: ret void +; diff --git a/llvm/test/Transforms/IROutliner/mismatched-phi-outputs-ordering.ll b/llvm/test/Transforms/IROutliner/mismatched-phi-outputs-ordering.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/IROutliner/mismatched-phi-outputs-ordering.ll @@ -0,0 +1,150 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we do not extract similar regions that would involve the splitting +; of phi nodes on exit. 
+ +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +next: + %2 = add i32 %d, 1 + %3 = add i32 %e, 1 + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + ret void +next: + %1 = add i32 %d, 1 + %2 = add i32 %e, 1 + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[D_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[E_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[E_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[D_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[LT_CAST2:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[E_LOC]], i32* [[D_LOC]], i32* [[DOTCE_LOC]], i32 0) +; CHECK-NEXT: [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4 +; CHECK-NEXT: [[D_RELOAD:%.*]] = load i32, i32* [[D_LOC]], align 4 +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call 
void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[D_RELOAD]], i32 [[E_RELOAD]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[D_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[E_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[E_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[D_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[E_LOC]], i32* [[D_LOC]], i32* null, i32 1) +; CHECK-NEXT: [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4 +; CHECK-NEXT: [[D_RELOAD:%.*]] = load i32, i32* [[D_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[D_RELOAD]], i32 [[E_RELOAD]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal i1 @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT:%.*]], label [[TEST:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; 
CHECK-NEXT: br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: first.exitStub: +; CHECK-NEXT: switch i32 [[TMP4:%.*]], label [[FINAL_BLOCK_1:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_1:%.*]] +; CHECK-NEXT: i32 1, label [[OUTPUT_BLOCK_1_1:%.*]] +; CHECK-NEXT: ] +; CHECK: next.exitStub: +; CHECK-NEXT: switch i32 [[TMP4]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_0:%.*]] +; CHECK-NEXT: i32 1, label [[OUTPUT_BLOCK_1_0:%.*]] +; CHECK-NEXT: ] +; CHECK: output_block_0_0: +; CHECK-NEXT: store i32 [[E]], i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: store i32 [[D]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: output_block_0_1: +; CHECK-NEXT: store i32 [[E]], i32* [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: output_block_1_0: +; CHECK-NEXT: store i32 [[E]], i32* [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[D]], i32* [[TMP2]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: output_block_1_1: +; CHECK-NEXT: store i32 [[E]], i32* [[TMP1]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: final_block_0: +; CHECK-NEXT: ret i1 false +; CHECK: final_block_1: +; CHECK-NEXT: ret i1 true +; +; +; CHECK-LABEL: @outlined_ir_func_1( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[NEXT_TO_OUTLINE:%.*]] +; CHECK: next_to_outline: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0:%.*]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1:%.*]], 1 +; CHECK-NEXT: br label [[NEXT_AFTER_OUTLINE_EXITSTUB:%.*]] +; CHECK: next_after_outline.exitStub: +; CHECK-NEXT: ret void +; diff --git a/llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll b/llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll new file mode 100644 --- 
/dev/null +++ b/llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Here we have multiple exits, but the different sources, same outputs are +; needed, this checks that they are compressed, and moved into the appropriate +; output blocks. + +define void @outline_outputs1() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = phi i32 [%aval, %block_4], [%a2val, %block_5] + ret void +} + +define void @outline_outputs2() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label 
%block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = phi i32 [%aval, %block_4], [%a2val, %block_5] + ret void +} + +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIFF_CE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]]) +; CHECK-NEXT: [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIFF_CE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: 
block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]]) +; CHECK-NEXT: [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ] +; CHECK-NEXT: ret void +; +; +; CHECK: define internal void @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[BLOCK_2_TO_OUTLINE:%.*]] +; CHECK: block_2_to_outline: +; CHECK-NEXT: [[A2VAL:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: [[B2VAL:%.*]] = load i32, i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 2, [[A2VAL]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 2, [[B2VAL]] +; CHECK-NEXT: br label [[BLOCK_5:%.*]] +; CHECK: block_3: +; CHECK-NEXT: [[AVAL:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[BVAL:%.*]] = load i32, i32* [[TMP1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 2, [[AVAL]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 2, [[BVAL]] +; CHECK-NEXT: br label [[BLOCK_4:%.*]] +; CHECK: block_4: +; CHECK-NEXT: store i32 [[ADD]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: store i32 [[MUL]], i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_SPLIT:%.*]] +; CHECK: block_5: +; CHECK-NEXT: store i32 [[ADD2]], i32* [[TMP2]], align 4 +; CHECK-NEXT: store i32 [[MUL2]], i32* [[TMP3]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_SPLIT]] +; CHECK: block_6.split: +; CHECK-NEXT: [[DIFF_CE:%.*]] = phi i32 [ [[AVAL]], [[BLOCK_4]] ], [ [[A2VAL]], [[BLOCK_5]] ] +; CHECK-NEXT: br label 
[[BLOCK_6_EXITSTUB:%.*]] +; CHECK: block_6.exitStub: +; CHECK-NEXT: store i32 [[DIFF_CE]], i32* [[TMP4:%.*]], align 4 +; CHECK-NEXT: ret void +; diff --git a/llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll b/llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll --- a/llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll +++ b/llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll @@ -37,42 +37,50 @@ } ; CHECK-LABEL: @function1( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0]], align 4 -; CHECK-NEXT: br label [[TEST1:%.*]] -; CHECK: test1: -; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]]) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) ; CHECK-NEXT: br label [[FIRST:%.*]] -; CHECK: test: -; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]]) -; CHECK-NEXT: br label [[FIRST]] ; CHECK: first: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: @function2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0]], align 4 -; CHECK-NEXT: br label [[TEST1:%.*]] -; CHECK: test1: -; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void 
@outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]]) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) ; CHECK-NEXT: br label [[FIRST:%.*]] -; CHECK: test: -; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]]) -; CHECK-NEXT: br label [[FIRST]] ; CHECK: first: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; ; -; CHECK: define internal void @outlined_ir_func_0( +; CHECK-LABEL: define internal void @outlined_ir_func_0( ; CHECK-NEXT: newFuncRoot: -; CHECK-NEXT: br label [[TEST_TO_OUTLINE:%.*]] -; CHECK: test_to_outline: -; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br label [[FIRST_SPLIT:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br label [[FIRST_SPLIT]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] ; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] ; CHECK: first.exitStub: +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/IROutliner/phi-nodes-output-overload.ll b/llvm/test/Transforms/IROutliner/phi-nodes-output-overload.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/IROutliner/phi-nodes-output-overload.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we do not extract similar 
regions that would involve the splitting +; of phi nodes on exit. + +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +next: + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + ret void +next: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], 
i32 1) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal i1 @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT:%.*]], label [[PHI_BLOCK:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT]], label [[PHI_BLOCK]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: first.exitStub: +; CHECK-NEXT: switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_1:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_1:%.*]] +; CHECK-NEXT: ] +; CHECK: next.exitStub: +; CHECK-NEXT: switch i32 [[TMP2]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_1_0:%.*]] +; CHECK-NEXT: ] +; CHECK: output_block_0_1: +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: output_block_1_0: +; CHECK-NEXT: store i32 [[TMP3:%.*]], i32* [[TMP1]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: phi_block: +; CHECK-NEXT: [[TMP3]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[NEXT_EXITSTUB:%.*]] +; CHECK: final_block_0: +; CHECK-NEXT: ret i1 false +; CHECK: final_block_1: +; CHECK-NEXT: ret i1 true +; diff --git 
a/llvm/test/Transforms/IROutliner/region-inputs-in-phi-nodes.ll b/llvm/test/Transforms/IROutliner/region-inputs-in-phi-nodes.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/IROutliner/region-inputs-in-phi-nodes.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we are able to propogate inputs to the region into the split PHINode +; outside of the region if necessary. + +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + %z = add i32 %c, %c + br i1 true, label %test1, label %first +test1: + %e = load i32, i32* %0, align 4 + %1 = add i32 %c, %c + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + %2 = phi i32 [ %d, %test ], [ %e, %test1 ], [ %c, %entry ] + ret void +next: + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + %z = mul i32 %c, %c + br i1 true, label %test1, label %first +test1: + %e = load i32, i32* %0, align 4 + %1 = add i32 %c, %c + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + %2 = phi i32 [ %d, %test ], [ %e, %test1 ], [ %c, %entry ] + ret void +next: + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[Z:%.*]] = add i32 [[C]], [[C]] +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32 [[C]], i32* 
[[DOTCE_LOC]]) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br i1 [[TARGETBLOCK]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[Z:%.*]] = mul i32 [[C]], [[C]] +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32 [[C]], i32* [[DOTCE_LOC]]) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br i1 [[TARGETBLOCK]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal i1 @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: br i1 true, label [[TEST1:%.*]], label [[FIRST_SPLIT:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1:%.*]], [[TMP1]] +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT]], label [[TEST:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[D]], [[TEST]] ], [ 
[[E]], [[TEST1]] ], [ [[TMP1]], [[ENTRY_TO_OUTLINE]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: first.exitStub: +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: ret i1 true +; CHECK: next.exitStub: +; CHECK-NEXT: ret i1 false +; diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -994,7 +994,7 @@ LogicalResult matchAndRewrite(GenericOp genericOp, PatternRewriter &rewriter) const override { // Only apply to elementwise linalg on tensor. - if (!genericOp.hasTensorSemantics() || + if (!genericOp.hasTensorSemantics() || genericOp.hasIndexSemantics() || genericOp.getNumParallelLoops() != genericOp.getNumLoops()) return failure(); // Only support identity output maps. It could be extended to permuations if diff --git a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir --- a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir +++ b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir @@ -124,3 +124,30 @@ // CHECK-SAME: outs(%{{.+}} : tensor<6x5xf32>) // CHECK: tensor.expand_shape %[[OP]] // CHECK-SAME: tensor<6x5xf32> into tensor<2x3x5xf32> + +// ----- + +func @generic_op_index_semantics(%A: tensor, %B: tensor<16xi64>, %init: tensor) -> tensor { + %0 = tensor.expand_shape %A [[0, 1], [2]] + : tensor into tensor + %2 = linalg.generic {indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1, d2)>], + iterator_types = ["parallel", "parallel", "parallel"]} + ins(%0, %B : tensor, tensor<16xi64>) + outs(%init : tensor) { + ^bb0(%arg1: i64, %arg2: i64, %arg3: i64): // no predecessors + %index = linalg.index 0 : index + %1 = arith.index_cast %index : index to i64 + %add = arith.addi %arg1, %1 : i64 + %s 
= arith.subi %add, %arg2 : i64 + linalg.yield %s : i64 + } -> tensor + return %2 : tensor +} +// CHECK: func @generic_op_index_semantics +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK: %[[RESHAPE:.+]] = tensor.expand_shape %[[ARG0]] +// CHECK: %[[RESULT:.+]] = linalg.generic +// CHECK-SAME: ins(%[[RESHAPE]] +// CHECK: return %[[RESULT]]