diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp
--- a/clang-tools-extra/clang-doc/Serialize.cpp
+++ b/clang-tools-extra/clang-doc/Serialize.cpp
@@ -168,7 +168,7 @@
 }
 
 bool ClangDocCommentVisitor::isWhitespaceOnly(llvm::StringRef S) const {
-  return std::all_of(S.begin(), S.end(), isspace);
+  return llvm::all_of(S, isspace);
 }
 
 std::string ClangDocCommentVisitor::getCommandName(unsigned CommandID) const {
diff --git a/clang-tools-extra/clang-move/Move.cpp b/clang-tools-extra/clang-move/Move.cpp
--- a/clang-tools-extra/clang-move/Move.cpp
+++ b/clang-tools-extra/clang-move/Move.cpp
@@ -920,8 +920,7 @@
       return false;
     }
   };
-  if (std::none_of(UnremovedDeclsInOldHeader.begin(),
-                   UnremovedDeclsInOldHeader.end(), IsSupportedKind) &&
+  if (llvm::none_of(UnremovedDeclsInOldHeader, IsSupportedKind) &&
       !Context->Spec.OldHeader.empty()) {
     auto &SM = RemovedDecls[0]->getASTContext().getSourceManager();
     moveAll(SM, Context->Spec.OldHeader, Context->Spec.NewHeader);
diff --git a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp
--- a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp
+++ b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp
@@ -38,14 +38,10 @@
 // set of reserved characters. See:
 // https://www.unicode.org/reports/tr35/tr35.html#Invalid_Patterns
 bool isValidDatePattern(StringRef Pattern) {
-  for (auto &PatternChar : Pattern) {
-    if (isalpha(PatternChar)) {
-      if (!llvm::is_contained(ValidDatePatternChars, PatternChar)) {
-        return false;
-      }
-    }
-  }
-  return true;
+  return llvm::all_of(Pattern, [](const auto &PatternChar) {
+    return !isalpha(PatternChar) ||
+           llvm::is_contained(ValidDatePatternChars, PatternChar);
+  });
 }
 
 // Checks if the string pattern used as a date format specifier contains
diff --git a/clang-tools-extra/clangd/URI.cpp b/clang-tools-extra/clangd/URI.cpp
--- a/clang-tools-extra/clangd/URI.cpp
+++ b/clang-tools-extra/clangd/URI.cpp
@@ -142,7 +142,7 @@
     return false;
   if (!llvm::isAlpha(Scheme[0]))
     return false;
-  return std::all_of(Scheme.begin() + 1, Scheme.end(), [](char C) {
+  return llvm::all_of(llvm::drop_begin(Scheme), [](char C) {
     return llvm::isAlnum(C) || C == '+' || C == '.' || C == '-';
   });
 }
diff --git a/clang-tools-extra/clangd/index/CanonicalIncludes.cpp b/clang-tools-extra/clangd/index/CanonicalIncludes.cpp
--- a/clang-tools-extra/clangd/index/CanonicalIncludes.cpp
+++ b/clang-tools-extra/clangd/index/CanonicalIncludes.cpp
@@ -777,12 +777,11 @@
                          llvm::sys::path::end(Path)) <= MaxSuffixComponents;
   }));
   // ... and precise.
-  assert(llvm::find_if(SystemHeaderMap->keys(), [](llvm::StringRef Path) {
-           return std::distance(llvm::sys::path::begin(
-                                    Path, llvm::sys::path::Style::posix),
-                                llvm::sys::path::end(Path)) ==
-                  MaxSuffixComponents;
-         }) != SystemHeaderMap->keys().end());
+  assert(llvm::any_of(SystemHeaderMap->keys(), [](llvm::StringRef Path) {
+    return std::distance(
+               llvm::sys::path::begin(Path, llvm::sys::path::Style::posix),
+               llvm::sys::path::end(Path)) == MaxSuffixComponents;
+  }));
 
   // FIXME: Suffix mapping contains invalid entries for C, so only enable it for
   // CPP.
diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp
--- a/clang-tools-extra/clangd/refactor/Rename.cpp
+++ b/clang-tools-extra/clangd/refactor/Rename.cpp
@@ -757,12 +757,12 @@
     return StartOffset.takeError();
   if (!EndOffset)
     return EndOffset.takeError();
-  if (llvm::find_if(
+  if (llvm::none_of(
           *MainFileRenameEdit,
           [&StartOffset, &EndOffset](const clang::tooling::Replacement &R) {
             return R.getOffset() == *StartOffset &&
                    R.getLength() == *EndOffset - *StartOffset;
-          }) == MainFileRenameEdit->end()) {
+          })) {
     return makeError(ReasonToReject::NoSymbolFound);
   }
   RenameResult Result;
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -71,6 +71,9 @@
 - Fix `#57008 `_ - Builtin C++ language extension type traits
   instantiated by a template with unexpected number of arguments cause an
   assertion fault.
+- Fix multi-level pack expansion of undeclared function parameters.
+  This fixes `Issue 56094 `_.
+
 Improvements to Clang's diagnostics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -3061,10 +3061,6 @@
                             PREARGS_START + getNumPreArgs() + getNumArgs());
   }
 
-  /// getNumCommas - Return the number of commas that must have been present in
-  /// this function call.
-  unsigned getNumCommas() const { return getNumArgs() ? getNumArgs() - 1 : 0; }
-
   /// Get FPOptionsOverride from trailing storage.
   FPOptionsOverride getStoredFPFeatures() const {
     assert(hasStoredFPFeatures());
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
@@ -348,10 +348,12 @@
   /// Returns the `DeclContext` of the block being analysed, if any. Otherwise,
   /// returns null.
-  const DeclContext *getDeclCtx() { return DeclCtx; }
+  const DeclContext *getDeclCtx() { return CallStack.back(); }
 
-  /// Sets the `DeclContext` of the block being analysed.
-  void setDeclCtx(const DeclContext *Ctx) { DeclCtx = Ctx; }
+  /// Returns whether this `Environment` can be extended to analyze the given
+  /// `Callee` (i.e. if `pushCall` can be used), with recursion disallowed and a
+  /// given `MaxDepth`.
+  bool canDescend(unsigned MaxDepth, const DeclContext *Callee) const;
 
   /// Returns the `ControlFlowContext` registered for `F`, if any. Otherwise,
   /// returns null.
@@ -390,7 +392,7 @@
   DataflowAnalysisContext *DACtx;
 
   // `DeclContext` of the block being analysed if provided.
-  const DeclContext *DeclCtx = nullptr;
+  std::vector<const DeclContext *> CallStack;
 
   // In a properly initialized `Environment`, `ReturnLoc` should only be null if
   // its `DeclContext` could not be cast to a `FunctionDecl`.
diff --git a/clang/include/clang/Analysis/FlowSensitive/Transfer.h b/clang/include/clang/Analysis/FlowSensitive/Transfer.h
--- a/clang/include/clang/Analysis/FlowSensitive/Transfer.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Transfer.h
@@ -21,7 +21,11 @@
 namespace clang {
 namespace dataflow {
 
-struct ContextSensitiveOptions {};
+struct ContextSensitiveOptions {
+  /// The maximum depth to analyze. A value of zero is equivalent to disabling
+  /// context-sensitive analysis entirely.
+  unsigned Depth = 2;
+};
 
 struct TransferOptions {
   /// Options for analyzing function bodies when present in the translation
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3486,8 +3486,8 @@
 bool FunctionDecl::hasOneParamOrDefaultArgs() const {
   return getNumParams() == 1 ||
          (getNumParams() > 1 &&
-          std::all_of(param_begin() + 1, param_end(),
-                      [](ParmVarDecl *P) { return P->hasDefaultArg(); }));
+          llvm::all_of(llvm::drop_begin(parameters()),
+                       [](ParmVarDecl *P) { return P->hasDefaultArg(); }));
 }
 
 /// The combination of the extern and inline keywords under MSVC forces
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -154,10 +154,10 @@
     : DACtx(&DACtx), FlowConditionToken(&DACtx.makeFlowConditionToken()) {}
 
 Environment::Environment(const Environment &Other)
-    : DACtx(Other.DACtx), DeclCtx(Other.DeclCtx), ReturnLoc(Other.ReturnLoc),
-      ThisPointeeLoc(Other.ThisPointeeLoc), DeclToLoc(Other.DeclToLoc),
-      ExprToLoc(Other.ExprToLoc), LocToVal(Other.LocToVal),
-      MemberLocToStruct(Other.MemberLocToStruct),
+    : DACtx(Other.DACtx), CallStack(Other.CallStack),
+      ReturnLoc(Other.ReturnLoc), ThisPointeeLoc(Other.ThisPointeeLoc),
+      DeclToLoc(Other.DeclToLoc), ExprToLoc(Other.ExprToLoc),
+      LocToVal(Other.LocToVal), MemberLocToStruct(Other.MemberLocToStruct),
       FlowConditionToken(&DACtx->forkFlowCondition(*Other.FlowConditionToken)) {
 }
 
@@ -168,11 +168,11 @@
 }
 
 Environment::Environment(DataflowAnalysisContext &DACtx,
-                         const DeclContext &DeclCtxArg)
+                         const DeclContext &DeclCtx)
     : Environment(DACtx) {
-  setDeclCtx(&DeclCtxArg);
+  CallStack.push_back(&DeclCtx);
 
-  if (const auto *FuncDecl = dyn_cast<FunctionDecl>(DeclCtx)) {
+  if (const auto *FuncDecl = dyn_cast<FunctionDecl>(&DeclCtx)) {
     assert(FuncDecl->getBody() != nullptr);
     initGlobalVars(*FuncDecl->getBody(), *this);
     for (const auto *ParamDecl : FuncDecl->parameters()) {
@@ -187,7 +187,7 @@
     ReturnLoc = &createStorageLocation(ReturnType);
   }
 
-  if (const auto *MethodDecl = dyn_cast<CXXMethodDecl>(DeclCtx)) {
+  if (const auto *MethodDecl = dyn_cast<CXXMethodDecl>(&DeclCtx)) {
     auto *Parent = MethodDecl->getParent();
     assert(Parent != nullptr);
     if (Parent->isLambda())
@@ -205,6 +205,13 @@
   }
 }
 
+bool Environment::canDescend(unsigned MaxDepth,
+                             const DeclContext *Callee) const {
+  return CallStack.size() <= MaxDepth &&
+         std::find(CallStack.begin(), CallStack.end(), Callee) ==
+             CallStack.end();
+}
+
 Environment Environment::pushCall(const CallExpr *Call) const {
   Environment Env(*this);
 
@@ -239,7 +246,7 @@
 void Environment::pushCallInternal(const FunctionDecl *FuncDecl,
                                    ArrayRef<const Expr *> Args) {
-  setDeclCtx(FuncDecl);
+  CallStack.push_back(FuncDecl);
 
   // FIXME: In order to allow the callee to reference globals, we probably need
   // to call `initGlobalVars` here in some way.
@@ -326,13 +333,13 @@
   assert(DACtx == Other.DACtx);
   assert(ReturnLoc == Other.ReturnLoc);
   assert(ThisPointeeLoc == Other.ThisPointeeLoc);
-  assert(DeclCtx == Other.DeclCtx);
+  assert(CallStack == Other.CallStack);
 
   auto Effect = LatticeJoinEffect::Unchanged;
 
   Environment JoinedEnv(*DACtx);
 
-  JoinedEnv.setDeclCtx(DeclCtx);
+  JoinedEnv.CallStack = CallStack;
   JoinedEnv.ReturnLoc = ReturnLoc;
   JoinedEnv.ThisPointeeLoc = ThisPointeeLoc;
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -661,7 +661,8 @@
   // `F` of `S`. The type `E` must be either `CallExpr` or `CXXConstructExpr`.
   template <typename E>
   void transferInlineCall(const E *S, const FunctionDecl *F) {
-    if (!Options.ContextSensitiveOpts)
+    if (!(Options.ContextSensitiveOpts &&
+          Env.canDescend(Options.ContextSensitiveOpts->Depth, F)))
       return;
 
     const ControlFlowContext *CFCtx = Env.getControlFlowContext(F);
@@ -689,7 +690,7 @@
     assert(CFCtx->getDecl() != nullptr &&
            "ControlFlowContexts in the environment should always carry a decl");
     auto Analysis = NoopAnalysis(CFCtx->getDecl()->getASTContext(),
-                                 DataflowAnalysisOptions());
+                                 DataflowAnalysisOptions{Options});
 
     auto BlockToOutputState =
         dataflow::runDataflowAnalysis(*CFCtx, Analysis, CalleeEnv);
diff --git a/clang/lib/Analysis/ReachableCode.cpp b/clang/lib/Analysis/ReachableCode.cpp
--- a/clang/lib/Analysis/ReachableCode.cpp
+++ b/clang/lib/Analysis/ReachableCode.cpp
@@ -299,6 +299,12 @@
     if (isa<SwitchStmt>(Term)) {
       return isConfigurationValue(Term, PP);
     }
+    // Do not treat constexpr if statement successors as unreachable in warnings
+    // since the point of these statements is to determine branches at compile
+    // time.
+    if (const auto *IS = dyn_cast<IfStmt>(Term);
+        IS != nullptr && IS->isConstexpr())
+      return true;
   }
 
   const Stmt *Cond = B->getTerminatorCondition(/* stripParens */ false);
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -332,8 +332,7 @@
     return;
 
   // No special characters are allowed in CaretLine.
-  assert(CaretLine.end() ==
-         llvm::find_if(CaretLine, [](char c) { return c < ' ' || '~' < c; }));
+  assert(llvm::none_of(CaretLine, [](char c) { return c < ' ' || '~' < c; }));
 
   // Find the slice that we need to display the full caret line
   // correctly.
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -3788,9 +3788,8 @@
             // Variable is used if it has been marked as an array, array
             // section, array shaping or the variable iself.
             return StackComponents.size() == 1 ||
-                   std::all_of(
-                       std::next(StackComponents.rbegin()),
-                       StackComponents.rend(),
+                   llvm::all_of(
+                       llvm::drop_begin(llvm::reverse(StackComponents)),
                        [](const OMPClauseMappableExprCommon::
                               MappableComponent &MC) {
                          return MC.getAssociatedDeclaration() ==
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -5792,6 +5792,7 @@
             = dyn_cast<PackExpansionType>(OldType)) {
       // We have a function parameter pack that may need to be expanded.
       QualType Pattern = Expansion->getPattern();
+      NumExpansions = Expansion->getNumExpansions();
       SmallVector<UnexpandedParameterPack, 2> Unexpanded;
       getSema().collectUnexpandedParameterPacks(Pattern, Unexpanded);
diff --git a/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp b/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
--- a/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
+++ b/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
@@ -469,3 +469,25 @@
     bar(b);
   }
 }
+
+namespace pr56094 {
+template <typename... T> struct D {
+  template <typename... U> using B = int(int (*...p)(T, U));
+  // expected-error@-1 {{pack expansion contains parameter pack 'U' that has a different length (1 vs. 2) from outer parameter packs}}
+  template <typename... U> D(B<U...> *);
+  // expected-note@-1 {{in instantiation of template type alias 'B' requested here}}
+};
+using t1 = D<float, int>::B<int>;
+// expected-note@-1 {{in instantiation of template class 'pr56094::D<float, int>' requested here}}
+
+template <typename...> struct F {};
+template <typename...> struct G {};
+template <typename... T> struct E {
+  template <typename... U> using B = G<F<T, U>...>;
+  // expected-error@-1 {{pack expansion contains parameter pack 'U' that has a different length (1 vs. 2) from outer parameter packs}}
+  template <typename... U> E(B<U...> *);
+  // expected-note@-1 {{in instantiation of template type alias 'B' requested here}}
+};
+using t2 = E<float, int>::B<int>;
+// expected-note@-1 {{in instantiation of template class 'pr56094::E<float, int>' requested here}}
+} // namespace pr56094
diff --git a/clang/test/CodeGenCXX/pragma-init_seg.cpp b/clang/test/CodeGenCXX/pragma-init_seg.cpp
--- a/clang/test/CodeGenCXX/pragma-init_seg.cpp
+++ b/clang/test/CodeGenCXX/pragma-init_seg.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -no-opaque-pointers %s -triple=i686-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=i686-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
 
 int f();
 
@@ -10,12 +10,12 @@
 #pragma init_seg(compiler)
 int x = f();
 // CHECK: @"?x@simple_init@@3HA" = dso_local global i32 0, align 4
-// CHECK: @__cxx_init_fn_ptr = private constant void ()* @"??__Ex@simple_init@@YAXXZ", section ".CRT$XCC"
+// CHECK: @__cxx_init_fn_ptr = private constant ptr @"??__Ex@simple_init@@YAXXZ", section ".CRT$XCC"
 
 #pragma init_seg(lib)
 int y = f();
 // CHECK: @"?y@simple_init@@3HA" = dso_local global i32 0, align 4
-// CHECK: @__cxx_init_fn_ptr.1 = private constant void ()* @"??__Ey@simple_init@@YAXXZ", section ".CRT$XCL"
+// CHECK: @__cxx_init_fn_ptr.1 = private constant ptr @"??__Ey@simple_init@@YAXXZ", section ".CRT$XCL"
 
 #pragma init_seg(user)
 int z = f();
@@ -29,14 +29,14 @@
 namespace {
 int x = f();
 // CHECK: @"?x@?A0x{{[^@]*}}@internal_init@@3HA" = internal global i32 0, align 4
-// CHECK: @__cxx_init_fn_ptr.2 = private constant void ()* @"??__Ex@?A0x{{[^@]*}}@internal_init@@YAXXZ", section ".asdf"
+// CHECK: @__cxx_init_fn_ptr.2 = private constant ptr @"??__Ex@?A0x{{[^@]*}}@internal_init@@YAXXZ", section ".asdf"
 }
 }
 
 namespace selectany_init {
 int __declspec(selectany) x = f();
 // CHECK: @"?x@selectany_init@@3HA" = weak_odr dso_local global i32 0, comdat, align 4
-// CHECK: @__cxx_init_fn_ptr.3 = private constant void ()* @"??__Ex@selectany_init@@YAXXZ", section ".asdf", comdat($"?x@selectany_init@@3HA")
+// CHECK: @__cxx_init_fn_ptr.3 = private constant ptr @"??__Ex@selectany_init@@YAXXZ", section ".asdf", comdat($"?x@selectany_init@@3HA")
 }
 
 namespace explicit_template_instantiation {
@@ -44,7 +44,7 @@
 template <typename T> const int A<T>::x = f();
 template struct A<int>;
 // CHECK: @"?x@?$A@H@explicit_template_instantiation@@2HB" = weak_odr dso_local global i32 0, comdat, align 4
-// CHECK: @__cxx_init_fn_ptr.4 = private constant void ()* @"??__E?x@?$A@H@explicit_template_instantiation@@2HB@@YAXXZ", section ".asdf", comdat($"?x@?$A@H@explicit_template_instantiation@@2HB")
@"??__E?x@?$A@H@explicit_template_instantiation@@2HB@@YAXXZ", section ".asdf", comdat($"?x@?$A@H@explicit_template_instantiation@@2HB") +// CHECK: @__cxx_init_fn_ptr.4 = private constant ptr @"??__E?x@?$A@H@explicit_template_instantiation@@2HB@@YAXXZ", section ".asdf", comdat($"?x@?$A@H@explicit_template_instantiation@@2HB") } namespace implicit_template_instantiation { @@ -52,21 +52,21 @@ template const int A::x = f(); int g() { return A::x; } // CHECK: @"?x@?$A@H@implicit_template_instantiation@@2HB" = linkonce_odr dso_local global i32 0, comdat, align 4 -// CHECK: @__cxx_init_fn_ptr.5 = private constant void ()* @"??__E?x@?$A@H@implicit_template_instantiation@@2HB@@YAXXZ", section ".asdf", comdat($"?x@?$A@H@implicit_template_instantiation@@2HB") +// CHECK: @__cxx_init_fn_ptr.5 = private constant ptr @"??__E?x@?$A@H@implicit_template_instantiation@@2HB@@YAXXZ", section ".asdf", comdat($"?x@?$A@H@implicit_template_instantiation@@2HB") } // ... and here's where we emitted user level ctors. -// CHECK: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] -// CHECK: [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_pragma_init_seg.cpp, i8* null }] +// CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] +// CHECK: [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_pragma_init_seg.cpp, ptr null }] // We have to mark everything used so we can survive globalopt, even through // LTO. There's no way LLVM could really understand if data in the .asdf // section is really used or dead. // -// CHECK: @llvm.used = appending global [6 x i8*] -// CHECK: [i8* bitcast (void ()** @__cxx_init_fn_ptr to i8*), -// CHECK: i8* bitcast (void ()** @__cxx_init_fn_ptr.1 to i8*), -// CHECK: i8* bitcast (void ()** @__cxx_init_fn_ptr.2 to i8*), -// CHECK: i8* bitcast (void ()** @__cxx_init_fn_ptr.3 to i8*), -// CHECK: i8* bitcast (void ()** @__cxx_init_fn_ptr.4 to i8*), -// CHECK: i8* bitcast (void ()** @__cxx_init_fn_ptr.5 to i8*)], section "llvm.metadata" +// CHECK: @llvm.used = appending global [6 x ptr] +// CHECK: [ptr @__cxx_init_fn_ptr, +// CHECK: ptr @__cxx_init_fn_ptr.1, +// CHECK: ptr @__cxx_init_fn_ptr.2, +// CHECK: ptr @__cxx_init_fn_ptr.3, +// CHECK: ptr @__cxx_init_fn_ptr.4, +// CHECK: ptr @__cxx_init_fn_ptr.5], section "llvm.metadata" diff --git a/clang/test/Driver/avr-ld.c b/clang/test/Driver/avr-ld.c --- a/clang/test/Driver/avr-ld.c +++ b/clang/test/Driver/avr-ld.c @@ -1,44 +1,44 @@ -// RUN: %clang -### --target=avr -mmcu=at90s2313 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKA %s +// RUN: %clang -### --target=avr -mmcu=at90s2313 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKA %s // LINKA: {{".*ld.*"}} {{.*}} {{"-L.*tiny-stack"}} {{.*}} "-Tdata=0x800060" "--start-group" {{.*}} "-lat90s2313" {{.*}} "--end-group" "-mavr2" -// RUN: %clang -### --target=avr -mmcu=at90s8515 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKB %s +// RUN: %clang -### --target=avr -mmcu=at90s8515 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKB %s // LINKB: {{".*ld.*"}} {{.*}} "-Tdata=0x800060" "--start-group" {{.*}} "-lat90s8515" {{.*}} "--end-group" "-mavr2" -// RUN: %clang -### --target=avr -mmcu=attiny13 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKC %s +// RUN: %clang -### --target=avr -mmcu=attiny13 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKC %s // LINKC: {{".*ld.*"}} {{.*}} 
{{"-L.*avr25/tiny-stack"}} {{.*}} "-Tdata=0x800060" "--start-group" {{.*}} "-lattiny13" {{.*}} "--end-group" "-mavr25" -// RUN: %clang -### --target=avr -mmcu=attiny44 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKD %s +// RUN: %clang -### --target=avr -mmcu=attiny44 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKD %s // LINKD: {{".*ld.*"}} {{.*}} {{"-L.*avr25"}} {{.*}} "-Tdata=0x800060" "--start-group" {{.*}} "-lattiny44" {{.*}} "--end-group" "-mavr25" -// RUN: %clang -### --target=avr -mmcu=atmega103 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKE %s +// RUN: %clang -### --target=avr -mmcu=atmega103 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKE %s // LINKE: {{".*ld.*"}} {{.*}} {{"-L.*avr31"}} {{.*}} "-Tdata=0x800060" "--start-group" {{.*}} "-latmega103" {{.*}} "--end-group" "-mavr31" -// RUN: %clang -### --target=avr -mmcu=atmega8u2 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKF %s +// RUN: %clang -### --target=avr -mmcu=atmega8u2 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKF %s // LINKF: {{".*ld.*"}} {{.*}} {{"-L.*avr35"}} {{.*}} "-Tdata=0x800100" "--start-group" {{.*}} "-latmega8u2" {{.*}} "--end-group" "-mavr35" -// RUN: %clang -### --target=avr -mmcu=atmega48pa --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKG %s +// RUN: %clang -### --target=avr -mmcu=atmega48pa --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKG %s // LINKG: {{".*ld.*"}} {{.*}} {{"-L.*avr4"}} {{.*}} "-Tdata=0x800100" "--start-group" {{.*}} "-latmega48pa" {{.*}} "--end-group" "-mavr4" -// RUN: %clang -### --target=avr -mmcu=atmega328 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKH %s +// RUN: %clang -### --target=avr -mmcu=atmega328 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKH %s // LINKH: {{".*ld.*"}} {{.*}} {{"-L.*avr5"}} {{.*}} "-Tdata=0x800100" "--start-group" {{.*}} "-latmega328" {{.*}} "--end-group" "-mavr5" -// RUN: %clang -### --target=avr -mmcu=atmega1281 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKI %s +// RUN: %clang -### --target=avr -mmcu=atmega1281 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKI %s // LINKI: {{".*ld.*"}} {{.*}} {{"-L.*avr51"}} {{.*}} "-Tdata=0x800200" "--start-group" {{.*}} "-latmega1281" {{.*}} "--end-group" "-mavr51" -// RUN: %clang -### --target=avr -mmcu=atmega2560 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKJ %s +// RUN: %clang -### --target=avr -mmcu=atmega2560 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKJ %s // LINKJ: {{".*ld.*"}} {{.*}} {{"-L.*avr6"}} {{.*}} "-Tdata=0x800200" "--start-group" {{.*}} "-latmega2560" {{.*}} "--end-group" "-mavr6" -// RUN: %clang -### --target=avr -mmcu=attiny10 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKK %s +// RUN: %clang -### --target=avr -mmcu=attiny10 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKK %s // LINKK: {{".*ld.*"}} {{.*}} {{"-L.*avrtiny"}} {{.*}} "-Tdata=0x800040" "--start-group" {{.*}} "-lattiny10" {{.*}} "--end-group" "-mavrtiny" -// RUN: %clang -### --target=avr -mmcu=atxmega16a4 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKL %s +// RUN: %clang -### 
 --target=avr -mmcu=atxmega16a4 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKL %s
 // LINKL: {{".*ld.*"}} {{.*}} {{"-L.*avrxmega2"}} {{.*}} "-Tdata=0x802000" "--start-group" {{.*}} "-latxmega16a4" {{.*}} "--end-group" "-mavrxmega2"
 
-// RUN: %clang -### --target=avr -mmcu=atxmega64b3 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKM %s
+// RUN: %clang -### --target=avr -mmcu=atxmega64b3 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKM %s
 // LINKM: {{".*ld.*"}} {{.*}} {{"-L.*avrxmega4"}} {{.*}} "-Tdata=0x802000" "--start-group" {{.*}} "-latxmega64b3" {{.*}} "--end-group" "-mavrxmega4"
 
-// RUN: %clang -### --target=avr -mmcu=atxmega128a3u --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKN %s
+// RUN: %clang -### --target=avr -mmcu=atxmega128a3u --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKN %s
 // LINKN: {{".*ld.*"}} {{.*}} {{"-L.*avrxmega6"}} {{.*}} "-Tdata=0x802000" "--start-group" {{.*}} "-latxmega128a3u" {{.*}} "--end-group" "-mavrxmega6"
 
-// RUN: %clang -### --target=avr -mmcu=atxmega128a1 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKO %s
+// RUN: %clang -### --target=avr -mmcu=atxmega128a1 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKO %s
 // LINKO: {{".*ld.*"}} {{.*}} {{"-L.*avrxmega7"}} {{.*}} "-Tdata=0x802000" "--start-group" {{.*}} "-latxmega128a1" {{.*}} "--end-group" "-mavrxmega7"
diff --git a/clang/test/Driver/avr-toolchain.c b/clang/test/Driver/avr-toolchain.c
--- a/clang/test/Driver/avr-toolchain.c
+++ b/clang/test/Driver/avr-toolchain.c
@@ -73,7 +73,6 @@
 // LDS1: "-T" "avr.lds"
 // LDS1-NOT: "-mavr5"
 
-// RUN: %clang %s -### --target=avr -mmcu=atmega328 --sysroot=%S/Inputs/basic_avr_tree/ -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir 2>&1 | FileCheck --check-prefix=LIBGCC %s
 // RUN: %clang %s -### --target=avr -mmcu=atmega328 --sysroot=%S/Inputs/basic_avr_tree/ -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir --rtlib=libgcc 2>&1 | FileCheck --check-prefix=LIBGCC %s
 // LIBGCC: "-lgcc"
 // LIBGCC-NOT: libclang_rt
diff --git a/clang/test/SemaCXX/unreachable-code.cpp b/clang/test/SemaCXX/unreachable-code.cpp
--- a/clang/test/SemaCXX/unreachable-code.cpp
+++ b/clang/test/SemaCXX/unreachable-code.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -Wunreachable-code-aggressive -fblocks -verify %s
+// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fexceptions -fsyntax-only -Wunreachable-code-aggressive -fblocks -verify %s
 
 int j;
 int bar();
@@ -99,3 +99,34 @@
   }
 }
+
+namespace gh57123 {
+  bool foo() {
+    if constexpr (true) {
+      if (true)
+        return true;
+      else
+        return false; // expected-warning {{will never be executed}}
+    }
+    else
+      return false; // no-warning
+  }
+
+  bool bar() {
+    if (true)
+      return true;
+    else
+      return false; // expected-warning {{will never be executed}}
+  }
+
+  bool baz() {
+    if constexpr (true)
+      return true;
+    else {
+      if (true)
+        return true;
+      else
+        return false; // expected-warning {{will never be executed}}
+    }
+  }
+}
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -853,8 +853,8 @@
 // `__start_` and `__stop_` symbols.
 bool isValidCIdentifier(StringRef S) {
   return !S.empty() && (isAlpha(S[0]) || S[0] == '_') &&
-         std::all_of(S.begin() + 1, S.end(),
-                     [](char C) { return C == '_' || isAlnum(C); });
+         llvm::all_of(llvm::drop_begin(S),
+                      [](char C) { return C == '_' || isAlnum(C); });
 }
 
 Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -3902,6 +3902,36 @@
               {TransferOptions{/*.ContextSensitiveOpts=*/llvm::None}});
 }
 
+TEST(TransferTest, ContextSensitiveDepthZero) {
+  std::string Code = R"(
+    bool GiveBool();
+    void SetBool(bool &Var) { Var = true; }
+
+    void target() {
+      bool Foo = GiveBool();
+      SetBool(Foo);
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                auto &FooVal =
+                    *cast<BoolValue>(Env.getValue(*FooDecl, SkipPast::None));
+                EXPECT_FALSE(Env.flowConditionImplies(FooVal));
+                EXPECT_FALSE(Env.flowConditionImplies(Env.makeNot(FooVal)));
+              },
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/0}}});
+}
+
 TEST(TransferTest, ContextSensitiveSetTrue) {
   std::string Code = R"(
     bool GiveBool();
     void SetBool(bool &Var) { Var = true; }
@@ -4000,7 +4030,7 @@
               {TransferOptions{ContextSensitiveOptions{}}});
 }
 
-TEST(TransferTest, ContextSensitiveSetTwoLayers) {
+TEST(TransferTest, ContextSensitiveSetTwoLayersDepthOne) {
   std::string Code = R"(
     bool GiveBool();
     void SetBool1(bool &Var) { Var = true; }
     void SetBool2(bool &Var) { SetBool1(Var); }
@@ -4028,7 +4058,146 @@
                 EXPECT_FALSE(Env.flowConditionImplies(FooVal));
                 EXPECT_FALSE(Env.flowConditionImplies(Env.makeNot(FooVal)));
               },
-              {TransferOptions{ContextSensitiveOptions{}}});
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/1}}});
+}
+
+TEST(TransferTest, ContextSensitiveSetTwoLayersDepthTwo) {
+  std::string Code = R"(
+    bool GiveBool();
+    void SetBool1(bool &Var) { Var = true; }
+    void SetBool2(bool &Var) { SetBool1(Var); }
+
+    void target() {
+      bool Foo = GiveBool();
+      SetBool2(Foo);
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                auto &FooVal =
+                    *cast<BoolValue>(Env.getValue(*FooDecl, SkipPast::None));
+                EXPECT_TRUE(Env.flowConditionImplies(FooVal));
+              },
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/2}}});
+}
+
+TEST(TransferTest, ContextSensitiveSetThreeLayersDepthTwo) {
+  std::string Code = R"(
+    bool GiveBool();
+    void SetBool1(bool &Var) { Var = true; }
+    void SetBool2(bool &Var) { SetBool1(Var); }
+    void SetBool3(bool &Var) { SetBool2(Var); }
+
+    void target() {
+      bool Foo = GiveBool();
+      SetBool3(Foo);
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                auto &FooVal =
+                    *cast<BoolValue>(Env.getValue(*FooDecl, SkipPast::None));
+                EXPECT_FALSE(Env.flowConditionImplies(FooVal));
+                EXPECT_FALSE(Env.flowConditionImplies(Env.makeNot(FooVal)));
+              },
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/2}}});
+}
+
+TEST(TransferTest, ContextSensitiveSetThreeLayersDepthThree) {
+  std::string Code = R"(
+    bool GiveBool();
+    void SetBool1(bool &Var) { Var = true; }
+    void SetBool2(bool &Var) { SetBool1(Var); }
+    void SetBool3(bool &Var) { SetBool2(Var); }
+
+    void target() {
+      bool Foo = GiveBool();
+      SetBool3(Foo);
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                auto &FooVal =
+                    *cast<BoolValue>(Env.getValue(*FooDecl, SkipPast::None));
+                EXPECT_TRUE(Env.flowConditionImplies(FooVal));
+              },
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/3}}});
+}
+
+TEST(TransferTest, ContextSensitiveMutualRecursion) {
+  std::string Code = R"(
+    bool Pong(bool X, bool Y);
+
+    bool Ping(bool X, bool Y) {
+      if (X) {
+        return Y;
+      } else {
+        return Pong(!X, Y);
+      }
+    }
+
+    bool Pong(bool X, bool Y) {
+      if (Y) {
+        return X;
+      } else {
+        return Ping(X, !Y);
+      }
+    }
+
+    void target() {
+      bool Foo = Ping(false, false);
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                // The analysis doesn't crash...
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                auto &FooVal =
+                    *cast<BoolValue>(Env.getValue(*FooDecl, SkipPast::None));
+                // ... but it also can't prove anything here.
+                EXPECT_FALSE(Env.flowConditionImplies(FooVal));
+                EXPECT_FALSE(Env.flowConditionImplies(Env.makeNot(FooVal)));
+              },
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/4}}});
 }
 
 TEST(TransferTest, ContextSensitiveSetMultipleLines) {
diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
--- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -404,17 +404,14 @@
     if (!groupInPedantic(Group))
       continue;
 
-    unsigned ParentsInPedantic = 0;
     const std::vector<Record*> &Parents = DiagGroupParents.getParents(Group);
-    for (unsigned j = 0, ej = Parents.size(); j != ej; ++j) {
-      if (groupInPedantic(Parents[j]))
-        ++ParentsInPedantic;
-    }
+    bool AllParentsInPedantic =
+        llvm::all_of(Parents, [&](Record *R) { return groupInPedantic(R); });
 
     // If all the parents are in -Wpedantic, this means that this diagnostic
     // group will be indirectly included by -Wpedantic already.  In that
     // case, do not add it directly to -Wpedantic.  If the group has no
     // parents, obviously it should go into -Wpedantic.
-    if (Parents.size() > 0 && ParentsInPedantic == Parents.size())
+    if (Parents.size() > 0 && AllParentsInPedantic)
       continue;
 
     if (RecordVec *V = GroupsInPedantic.dyn_cast<RecordVec*>())
diff --git a/compiler-rt/lib/msan/msan_report.cpp b/compiler-rt/lib/msan/msan_report.cpp
--- a/compiler-rt/lib/msan/msan_report.cpp
+++ b/compiler-rt/lib/msan/msan_report.cpp
@@ -37,14 +37,14 @@
 static void DescribeStackOrigin(const char *so, uptr pc) {
   Decorator d;
   Printf("%s", d.Origin());
-  if (so == nullptr) {
-    Printf("  %sUninitialized value was created in the stack frame%s\n",
-           d.Origin(), d.Default());
-  } else {
+  if (so) {
     Printf(
         "  %sUninitialized value was created by an allocation of '%s%s%s'"
         " in the stack frame%s\n",
         d.Origin(), d.Name(), so, d.Origin(), d.Default());
+  } else {
+    Printf("  %sUninitialized value was created in the stack frame%s\n",
+           d.Origin(), d.Default());
   }
 
   if (pc)
diff --git a/compiler-rt/lib/ubsan/CMakeLists.txt b/compiler-rt/lib/ubsan/CMakeLists.txt
--- a/compiler-rt/lib/ubsan/CMakeLists.txt
+++ b/compiler-rt/lib/ubsan/CMakeLists.txt
@@ -192,7 +192,8 @@
     add_compiler_rt_runtime(clang_rt.ubsan_standalone
       STATIC
       ARCHS ${UBSAN_SUPPORTED_ARCH}
-      SOURCES ubsan_init_standalone_preinit.cpp
+      SOURCES
+        ubsan_init_standalone_preinit.cpp
       ADDITIONAL_HEADERS ${UBSAN_HEADERS}
       OBJECT_LIBS RTSanitizerCommon
                   RTSanitizerCommonLibc
diff --git a/flang/cmake/modules/AddFlang.cmake b/flang/cmake/modules/AddFlang.cmake
--- a/flang/cmake/modules/AddFlang.cmake
+++ b/flang/cmake/modules/AddFlang.cmake
@@ -18,7 +18,7 @@
 
 macro(add_flang_library name)
   cmake_parse_arguments(ARG
-    "SHARED;STATIC"
+    "SHARED;STATIC;INSTALL_WITH_TOOLCHAIN"
    ""
    "ADDITIONAL_HEADERS"
    ${ARGN})
@@ -65,7 +65,8 @@
 
  if (TARGET ${name})
-    if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "libflang")
+    if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "libflang"
+        OR ARG_INSTALL_WITH_TOOLCHAIN)
      get_target_export_arg(${name} Flang export_to_flangtargets
        UMBRELLA flang-libraries)
      install(TARGETS ${name}
        COMPONENT ${name}
diff --git a/flang/docs/PolymorphicEntities.md b/flang/docs/PolymorphicEntities.md
new file mode 100644
--- /dev/null
+++ b/flang/docs/PolymorphicEntities.md
@@ -0,0 +1,871 @@
+# Polymorphic Entities
+
+A polymorphic entity is a data entity that can be of different types during the
+execution of a program.
+
+This document aims to give insights into the representation of polymorphic
+entities in FIR and into how polymorphism-related constructs and features are
+lowered to FIR.
+
+## Fortran standard
+
+Here is a list of the sections and constraints of the Fortran standard relevant
+to polymorphic entities.
+
+- 7.3.2.1 - 7.3.2.2: TYPE specifier (TYPE(*))
+  - C708
+  - C709
+  - C710
+  - C711
+- 7.3.2.3: CLASS specifier
+- 7.5.4.5: The passed-object dummy argument
+  - C760
+- 9.7.1: ALLOCATE statement
+  - C933
+- 9.7.2: NULLIFY statement
+  - When a NULLIFY statement is applied to a polymorphic pointer (7.3.2.3),
+    its dynamic type becomes the same as its declared type.
+- 10.2.2.3: Data pointer assignment
+- 11.1.3: ASSOCIATE construct
+- 11.1.11: SELECT TYPE construct
+  - C1157
+  - C1158
+  - C1159
+  - C1160
+  - C1161
+  - C1162
+  - C1163
+  - C1164
+  - C1165
+- 16.9.76 EXTENDS_TYPE_OF (A, MOLD)
+- 16.9.165 SAME_TYPE_AS (A, B)
+- 16.9.184 STORAGE_SIZE (A [, KIND])
+- C.10.5 Polymorphic Argument Association (15.5.2.9)
+
+---
+
+## Representation in FIR
+
+### Polymorphic entities `CLASS(type1)`
+
+A polymorphic entity is represented as a class type in FIR.
+In the example below the dummy argument `p` is passed to the subroutine `foo`
+as a polymorphic entity with the extensible type `point`. The type information
+captured in the class is the best statically available at compile time.
+`!fir.class` is a new type introduced for polymorphic entities. It is similar
+to a box type but allows the distinction between a monomorphic and a
+polymorphic descriptor.
+A specific `BoxTypeInterface` (TypeInterface) can be introduced to share the
+same API for both types where it is necessary. `!fir.class` and `!fir.box` can
+also be based on a common `BaseBoxType`, similar to what `BaseMemRefType` does
+for MemRef.
+
+**Fortran**
+```fortran
+type point
+  real :: x, y
+end type point
+
+type, extends(point) :: point_3d
+  real :: z
+end type
+
+subroutine foo(p)
+  class(point) :: p
+  ! code of the subroutine
+end subroutine
+```
+
+**FIR**
+```c
+func.func @foo(%p : !fir.class<!fir.type<_QTpoint{x:f32,y:f32}>>)
+```
+
+### Unlimited polymorphic entities `CLASS(*)`
+
+The unlimited polymorphic entity is represented as a class type with `*`.
+
+**Fortran**
+```fortran
+subroutine bar(x)
+  class(*) :: x
+  ! code of the subroutine
+end subroutine
+```
+
+**FIR**
+```c
+func.func @bar(%x : !fir.class<*>)
+```
+
+### Assumed-type `TYPE(*)`
+
+Assumed type was added in Fortran 2018 and is available only for dummy
+arguments. It is mainly used for interfaces to non-Fortran code and is similar
+to C's `void`. It is not a polymorphic entity per se, and it is not currently
+implemented in flang.
+
+Assumed-type is represented as `!fir.type<*>`.
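+Since assumed type is aimed at interoperability, a typical use is an interface
+to a C routine taking an arbitrary buffer. The following sketch is illustrative
+only (the routine name and its interface are assumptions, not taken from an
+existing test):
+
+```fortran
+interface
+  subroutine c_process(buffer, n) bind(c)
+    type(*), intent(in) :: buffer(*)  ! assumed-type, akin to C's void *
+    integer, intent(in), value :: n
+  end subroutine
+end interface
+```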
+
+### SELECT TYPE construct
+
+The `SELECT TYPE` construct selects for execution at most one of its
+constituent blocks. The selection is based on the dynamic type of the selector.
+
+**Fortran**
+```fortran
+type point
+  real :: x, y
+end type point
+type, extends(point) :: point_3d
+  real :: z
+end type point_3d
+type, extends(point) :: color_point
+  integer :: color
+end type color_point
+
+type(point), target :: p
+type(point_3d), target :: p3
+type(color_point), target :: c
+class(point), pointer :: p_or_c
+p_or_c => c
+select type ( a => p_or_c )
+class is (point)
+  print*, a%x, a%y
+type is (point_3d)
+  print*, a%x, a%y, a%z
+class default
+  print*,
+end select
+```
+
+From the Fortran standard:
+> A `TYPE IS` type guard statement matches the selector if the dynamic type
+and kind type parameter values of the selector are the same as those specified
+by the statement. A `CLASS IS` type guard statement matches the selector if the
+dynamic type of the selector is an extension of the type specified by the
+statement and the kind type parameter values specified by the statement are the
+same as the corresponding type parameter values of the dynamic type of the
+selector.
+
+In the example above the `CLASS IS` type guard is matched.
+
+The construct is lowered to a specific FIR operation `fir.select_type`. It is
+similar to other FIR "select" operations such as `fir.select` and
+`fir.select_rank`. The dynamic type of the selector value is matched against a
+list of type descriptors. The `TYPE IS` type guard statement is represented by
+a `#fir.type_is` attribute and the `CLASS IS` type guard statement is
+represented by a `#fir.class_is` attribute.
+The `CLASS DEFAULT` type guard statement is represented by a `unit` attribute.
+
+**FIR**
+```
+fir.select_type %p : !fir.class<!fir.type<_QTpoint{x:f32,y:f32}>> [
+  #fir.class_is<!fir.type<_QTpoint{x:f32,y:f32}>>, ^bb1,
+  #fir.type_is<!fir.type<_QTpoint_3d{x:f32,y:f32,z:f32}>>, ^bb2,
+  unit, ^bb3]
+```
+
+Lowering of the `fir.select_type` operation will produce an if-then-else
+ladder. The testing of the dynamic type of the selector is done by calling
+runtime functions.
+
+The runtime has two functions to compare dynamic types. Note that these two
+functions _ignore_ the values of `KIND` type parameters. A version of these
+functions that does not _ignore_ the value of the `KIND` type parameters will
+be implemented for the `SELECT TYPE` type guards testing.
+
+These are the functions currently available for the `EXTENDS_TYPE_OF` and
+`SAME_TYPE_AS` intrinsics (`flang/include/flang/Evaluate/type.h`):
+```cpp
+std::optional<bool> ExtendsTypeOf(const DynamicType &) const;
+std::optional<bool> SameTypeAs(const DynamicType &) const;
+```
+
+**FIR** (lower level FIR/MLIR after conversion to an if-then-else ladder)
+```
+module {
+  func @f(%arg0: !fir.class<*>) -> i32 {
+    %c4_i32 = arith.constant 4 : i32
+    %c8_i32 = arith.constant 8 : i32
+    %c16_i32 = arith.constant 16 : i32
+    %0 = fir.gentypedesc !fir.tdesc<!fir.type<_QTpoint{x:f32,y:f32}>>
+    %1 = fir.convert %arg0 : (!fir.class<*>) -> !fir.box<none>
+    %2 = fir.convert %0 : (!fir.tdesc<!fir.type<_QTpoint{x:f32,y:f32}>>) -> !fir.ref<none>
+    %3 = fir.call @ExtendsTypeOfWithKind(%1, %2) : (!fir.box<none>, !fir.ref<none>) -> i1
+    cond_br %3, ^bb2(%c4_i32 : i32), ^bb1
+  ^bb1:  // pred: ^bb0
+    %4 = fir.gentypedesc !fir.type<_QTpoint_3d{x:f32,y:f32,z:f32}>
+    %5 = fir.convert %arg0 : (!fir.class<*>) -> !fir.box<none>
+    %6 = fir.convert %4 : (!fir.tdesc<!fir.type<_QTpoint_3d{x:f32,y:f32,z:f32}>>) -> !fir.ref<none>
+    %7 = fir.call @SameTypeAsWithKind(%5, %6) : (!fir.box<none>, !fir.ref<none>) -> i1
+    cond_br %7, ^bb4(%c16_i32 : i32), ^bb3
+  ^bb2(%8: i32):  // pred: ^bb0
+    return %8 : i32
+  ^bb3:  // pred: ^bb1
+    br ^bb5(%c8_i32 : i32)
+  ^bb4(%9: i32):  // pred: ^bb1
+    %10 = arith.addi %9, %9 : i32
+    return %10 : i32
+  ^bb5(%11: i32):  // pred: ^bb3
+    %12 = arith.muli %11, %11 : i32
+    return %12 : i32
+  }
+  func private @ExactSameTypeAsWithKind(!fir.box<none>, !fir.ref<none>) -> i1
+  func private @SameTypeAsWithKind(!fir.box<none>, !fir.ref<none>) -> i1
+}
+```
+
+Note: some dynamic type checks can be inlined for performance. A type check
+against an intrinsic type when dealing with unlimited polymorphic entities is
+an ideal candidate for an inlined check.
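+As an illustration of such a candidate, here is a minimal sketch (written for
+this document, not taken from the flang sources) of a `SELECT TYPE` on an
+unlimited polymorphic entity where the guards name intrinsic types, so the
+dynamic type tests could be inlined instead of calling the runtime:
+
+```fortran
+subroutine classify(x)
+  class(*) :: x
+  select type (x)
+  type is (integer(4))
+    print *, "a 32-bit integer:", x
+  type is (real(4))
+    print *, "a 32-bit real:", x
+  class default
+    print *, "something else"
+  end select
+end subroutine
+```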
+
+---
+
+## Dynamic dispatch
+
+Dynamic dispatch is the process of selecting which implementation of a
+polymorphic procedure to call at runtime. The runtime already has the
+information to be used in this process (more information can be found here:
+[RuntimeTypeInfo.md](RuntimeTypeInfo.md)).
+
+The declarations of the data structures are present in
+`flang/runtime/type-info.h`.
+
+In the example below, there is a basic type `shape` with two type extensions
+`triangle` and `rectangle`.
+The two type extensions override the `get_area` type-bound procedure.
+
+**UML**
+```
+                  |---------------------|
+                  |        Shape        |
+                  |---------------------|
+                  | + color:integer     |
+                  | + isFilled:logical  |
+                  |---------------------|
+                  | + init()            |
+                  | + get_area():real   |
+                  |---------------------|
+                            /\
+                           /__\
+                             |
+       |---------------------------------------------------|
+       |                                                   |
+       |                                                   |
+|---------------------|                         |---------------------|
+|      triangle       |                         |      rectangle      |
+|---------------------|                         |---------------------|
+| + base:real         |                         | + length:real       |
+| + height:real       |                         | + width:real        |
+|---------------------|                         |---------------------|
+| + get_area():real   |                         | + get_area():real   |
+|---------------------|                         |---------------------|
+```
+
+**Fortran**
+```fortran
+module geometry
+type :: shape
+  integer :: color
+  logical :: isFilled
+contains
+  procedure :: get_area => get_area_shape
+  procedure :: init => init_shape
+end type shape
+
+type, extends(shape) :: triangle
+  real :: base
+  real :: height
+contains
+  procedure :: get_area => get_area_triangle
+end type triangle
+
+type, extends(shape) :: rectangle
+  real :: length
+  real :: width
+contains
+  procedure :: get_area => get_area_rectangle
+end type rectangle
+
+type shape_array
+  class(shape), allocatable :: item
+end type
+
+contains
+
+function get_area_shape(this)
+  real :: get_area_shape
+  class(shape) :: this
+  get_area_shape = 0.0
+end function
+
+subroutine init_shape(this, color)
+  class(shape) :: this
+  integer :: color
+  this%color = color
+  this%isFilled = .false.
+end subroutine
+
+function get_area_triangle(this)
+  real :: get_area_triangle
+  class(triangle) :: this
+  get_area_triangle = (this%base * this%height) / 2
+end function
+
+function get_area_rectangle(this)
+  real :: get_area_rectangle
+  class(rectangle) :: this
+  get_area_rectangle = this%length * this%width
+end function
+
+function get_all_area(shapes)
+  real :: get_all_area
+  type(shape_array) :: shapes(:)
+  real :: sum
+  integer :: i
+
+  get_all_area = 0.0
+
+  do i = 1, size(shapes)
+    get_all_area = get_all_area + shapes(i)%item%get_area()
+  end do
+end function
+
+subroutine set_base_values(sh, v1, v2)
+  class(shape) :: sh
+  real, intent(in) :: v1, v2
+
+  select type (sh)
+  type is (triangle)
+    sh%base = v1
+    sh%height = v2
+  type is (rectangle)
+    sh%length = v1
+    sh%width = v2
+  class default
+    print*,'Cannot set values'
+  end select
+end subroutine
+
+end module
+
+program foo
+  use geometry
+
+  real :: area
+
+  type(shape_array), dimension(2) :: shapes
+
+  allocate (triangle::shapes(1)%item)
+  allocate (rectangle::shapes(2)%item)
+
+  do i = 1, size(shapes)
+    call shapes(i)%item%init(i)
+  end do
+
+  call set_base_values(shapes(1)%item, 2.0, 1.5)
+  call set_base_values(shapes(2)%item, 5.0, 4.5)
+
+  area = get_all_area(shapes)
+
+  print*, area
+
+  deallocate(shapes(1)%item)
+  deallocate(shapes(2)%item)
+end program
+```
+
+The `fir.dispatch` operation is used to perform a dynamic dispatch. This
+operation is comparable to the `fir.call` operation but for polymorphic
+entities. Calls to `NON_OVERRIDABLE` type-bound procedures are resolved at
+compile time and a `fir.call` operation is emitted instead of a `fir.dispatch`.
+When the type of a polymorphic entity can be fully determined at compile time,
+a `fir.dispatch` op can even be converted to a `fir.call` op. This will be
+discussed in more detail later in the document, in the devirtualization
+section.
+
+**FIR**
+Here is a simple example of the `fir.dispatch` operation. The operation
+specifies the binding name of the type-bound procedure to be called and passes
+the descriptor as an argument. If the `NOPASS` attribute is set, then the
+descriptor is not passed as an argument when lowered. If `PASS(arg-name)` is
+specified, the `fir.pass` attribute is added to point to the PASS argument in
+the `fir.dispatch` operation. The `fir.nopass` attribute is added for `NOPASS`.
+The descriptor still needs to be present in the `fir.dispatch` operation for
+the dynamic dispatch. The CodeGen will then omit the descriptor in the
+arguments of the generated call.
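+To make the passed-object conventions concrete, here is a minimal sketch
+(written for this document; the module and procedure names are assumptions) of
+bindings declared with the default `PASS` convention and with `NOPASS`:
+
+```fortran
+module m
+  type :: counter
+    integer :: n = 0
+  contains
+    procedure :: bump             ! default: counter is the passed-object dummy
+    procedure, nopass :: describe ! NOPASS: no descriptor is passed
+  end type
+contains
+  subroutine bump(this)
+    class(counter) :: this
+    this%n = this%n + 1
+  end subroutine
+  subroutine describe()
+    print *, "a simple counter type"
+  end subroutine
+end module
+```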
+
+The dispatch explanation focuses only on the call to `get_area()` as seen in
+the example.
+
+**Fortran**
+```fortran
+get_all_area = get_all_area + shapes(i)%item%get_area()
+```
+
+**FIR**
+```c
+%1 = fir.convert %0 : (!fir.ref<!fir.class<!fir.type<_QMgeometryTtriangle{color:i32,isFilled:!fir.logical<4>,base:f32,height:f32}>>>) -> !fir.ref<!fir.class<none>>
+%2 = fir.dispatch "get_area"(%1) : (!fir.ref<!fir.class<none>>) -> f32
+```
+
+The type information is stored in the `f18Addendum` of the descriptor. The
+format is defined in `flang/runtime/type-info.h` and part of its representation
+in LLVM IR is shown below. The binding is comparable to a vtable. Each derived
+type has a complete type-bound procedure table in which all of the bindings of
+its ancestor types appear first.
+
+**LLVMIR**
+
+Representation of the derived type information with the bindings.
+```c
+%_QM__fortran_type_infoTderivedtype = type { { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, { ptr, i64, i32, i8, i8, i8, i8 }, i64, { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, i32, i8, i8, i8, i8, [4 x i8] }
+%_QM__fortran_type_infoTbinding = type { %_QM__fortran_builtinsT__builtin_c_funptr, { ptr, i64, i32, i8, i8, i8, i8 } }
+%_QM__fortran_builtinsT__builtin_c_funptr = type { i64 }
+```
+
+The `fir.dispatch` operation is then lowered to use the runtime information to
+extract the correct function from the vtable and to perform the actual call.
+Here is what it can look like in pseudo LLVM IR code.
+
+**LLVMIR**
+```c
+// Retrieve the bindings (vtable) from the type information in the descriptor
+%1 = call %_QM__fortran_type_infoTbinding* @_FortranAGetBindings(%desc)
+// Retrieve the position of the specific binding in the table
+%2 = call i32 @_FortranAGetBindingOffset(%1, "get_area")
+// Get the binding from the table
+%3 = getelementptr %_QM__fortran_type_infoTbinding, %_QM__fortran_type_infoTbinding* %1, i32 0, i32 %2
+// Get the function pointer from the binding
+%4 = getelementptr %_QM__fortran_builtinsT__builtin_c_funptr, %_QM__fortran_type_infoTbinding %3, i32 0, i32 0
+// Cast the function pointer
+%5 = inttoptr i64 %4 to f32 (%_QMgeometryTshape*)*
+// Load the function
+%6 = load f32(%_QMgeometryTshape*)*, %5
+// Perform the actual function call
+%7 = call f32 %6(%_QMgeometryTshape* %shape)
+```
+
+_Note:_ functions `@_FortranAGetBindings` and `@_FortranAGetBindingOffset` are
+not available in the runtime and will need to be implemented.
+
+- `@_FortranAGetBindings` retrieves the bindings from the descriptor. The
+  descriptor holds the type information that holds the bindings.
+- `@_FortranAGetBindingOffset` retrieves the procedure offset in the bindings
+  based on the binding name provided.
+
+Retrieving the binding table and the offset are done separately so that
+multiple dynamic dispatches on the same polymorphic entity can be optimized
+(the binding table is retrieved only once for multiple calls).
+
+### Passing polymorphic entities as argument
+
+**Fortran**
+```fortran
+TYPE t1
+END TYPE
+TYPE, EXTENDS(t1) :: t2
+END TYPE
+```
+
+1) Dummy argument is fixed type and actual argument is fixed type.
+   - `TYPE(t1)` to `TYPE(t1)`: Nothing special to take into consideration.
+2) Dummy argument is polymorphic and actual argument is fixed type. In these
+   cases, the actual argument needs to be boxed to be passed to the
+   subroutine/function since those are expecting a descriptor (a Fortran
+   source that could produce this FIR is sketched after this list).
+   ```c
+   func.func @_QMmod1Ps(%arg0: !fir.class<!fir.type<_QMmod1Tshape{x:i32,y:i32}>>)
+   func.func @_QQmain() {
+     %0 = fir.alloca !fir.type<_QMmod1Tshape{x:i32,y:i32}> {uniq_name = "_QFEsh"}
+     %1 = fir.embox %0 : (!fir.ref<!fir.type<_QMmod1Tshape{x:i32,y:i32}>>) -> !fir.class<!fir.type<_QMmod1Tshape{x:i32,y:i32}>>
+     fir.call @_QMmod1Ps(%1) : (!fir.class<!fir.type<_QMmod1Tshape{x:i32,y:i32}>>) -> ()
+     return
+   }
+   ```
+   - `TYPE(t1)` to `CLASS(t1)`
+   - `TYPE(t2)` to `CLASS(t1)`
+   - `TYPE(t1)` to `CLASS(t2)` - Invalid
+   - `TYPE(t2)` to `CLASS(t2)`
+3) Actual argument is polymorphic and dummy argument is fixed type. These
+   cases are restricted to the declared type of the polymorphic entities.
+   - The simple case is when the actual argument is a scalar
+     polymorphic entity passed to a non-PDT. The caller just extracts the
+     base address from the descriptor and passes it to the function.
+   - In other cases, the caller needs to perform a copyin/copyout since it
+     cannot just extract the base address of the `CLASS(T)` because it is
+     likely not contiguous.
+   - `CLASS(t1)` to `TYPE(t1)`
+   - `CLASS(t2)` to `TYPE(t1)` - Invalid
+   - `CLASS(t1)` to `TYPE(t2)` - Invalid
+   - `CLASS(t2)` to `TYPE(t2)`
+4) Both actual and dummy arguments are polymorphic. These particular cases are
+   straightforward. The function expects polymorphic entities already.
+   The boxed type is passed without change.
+   - `CLASS(t1)` to `CLASS(t1)`
+   - `CLASS(t2)` to `CLASS(t1)`
+   - `CLASS(t1)` to `CLASS(t2)` - Invalid
+   - `CLASS(t2)` to `CLASS(t2)`
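+A Fortran source that could plausibly produce the FIR shown in case 2 above,
+reconstructed here from the mangled names (`_QMmod1Ps`, `_QQmain`, `_QFEsh`);
+it is an illustration, not a quote of an existing test:
+
+```fortran
+module mod1
+  type shape
+    integer :: x, y
+  end type shape
+contains
+  subroutine s(sh)       ! lowered to @_QMmod1Ps
+    class(shape) :: sh   ! polymorphic dummy: expects a descriptor
+  end subroutine
+end module
+
+program main             ! lowered to @_QQmain
+  use mod1
+  type(shape) :: sh      ! fixed-type actual argument ("_QFEsh")
+  call s(sh)             ! boxed (fir.embox) at the call site
+end program
+```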
+
+### User-Defined Derived Type Input/Output
+
+User-Defined Derived Type Input/Output allows the program to define how a
+derived type is read from or written to a file.
+
+There are 4 basic subroutines that can be defined:
+- Formatted READ
+- Formatted WRITE
+- Unformatted READ
+- Unformatted WRITE
+
+Here are their respective interfaces:
+
+**Fortran**
+```fortran
+subroutine read_formatted(dtv, unit, iotype, v_list, iostat, iomsg)
+subroutine write_formatted(dtv, unit, iotype, v_list, iostat, iomsg)
+subroutine read_unformatted(dtv, unit, iostat, iomsg)
+subroutine write_unformatted(dtv, unit, iostat, iomsg)
+```
+
+When defined on a derived type, these specific type-bound procedures are stored
+as special bindings in the type descriptor (see `SpecialBinding` in
+`flang/runtime/type-info.h`).
+
+With a derived type, the function call to `@_FortranAioOutputDescriptor` from
+the IO runtime will be emitted in lowering.
+
+**Fortran**
+```fortran
+type(t) :: x
+write(10), x
+```
+
+**FIR**
+```c
+%5 = fir.call @_FortranAioBeginUnformattedOutput(%c10_i32, %4, %c56_i32) : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
+%6 = fir.embox %2 : (!fir.ref<!fir.type<_QFTt{i:i32}>>) -> !fir.class<!fir.type<_QFTt{i:i32}>>
+%7 = fir.convert %6 : (!fir.class<!fir.type<_QFTt{i:i32}>>) -> !fir.box<none>
+%8 = fir.call @_FortranAioOutputDescriptor(%5, %7) : (!fir.ref<i8>, !fir.box<none>) -> i1
+%9 = fir.call @_FortranAioEndIoStatement(%5) : (!fir.ref<i8>) -> i32
+```
+
+When dealing with polymorphic entities, the call to the IO runtime can stay
+unchanged. The runtime function `OutputDescriptor` can make the dynamic
+dispatch to the correct binding stored in the descriptor.
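+For reference, a minimal sketch of how such a procedure is attached to a type
+through a generic binding (illustrative only; the type and names are
+assumptions, not from an existing test):
+
+```fortran
+module m
+  type :: t
+    integer :: i
+  contains
+    procedure :: write_formatted
+    generic :: write(formatted) => write_formatted  ! stored as a special binding
+  end type
+contains
+  subroutine write_formatted(dtv, unit, iotype, v_list, iostat, iomsg)
+    class(t), intent(in) :: dtv
+    integer, intent(in) :: unit
+    character(*), intent(in) :: iotype
+    integer, intent(in) :: v_list(:)
+    integer, intent(out) :: iostat
+    character(*), intent(inout) :: iomsg
+    write (unit, '(i0)', iostat=iostat, iomsg=iomsg) dtv%i
+  end subroutine
+end module
+```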
+
+Note that the frontend is already replacing some of the dynamic dispatch calls
+with the correct static ones. The optimization pass is useful for the cases not
+handled by the frontend, and especially for cases showing up after other
+optimizations are applied.
+
+### `ALLOCATE`/`DEALLOCATE` statements
+
+The allocation and deallocation of polymorphic entities are delegated to the
+runtime.
+The corresponding function signatures can be found in
+`flang/include/flang/Runtime/allocatable.h` and in
+`flang/include/flang/Runtime/pointer.h` for pointer allocation.
+
+`ALLOCATE`
+
+The `ALLOCATE` statement is lowered to runtime calls as shown in the example
+below.
+
+**Fortran**
+```fortran
+allocate(triangle::shapes(1)%item)
+allocate(rectangle::shapes(2)%item)
+```
+
+**FIR**
+```c
+%0 = fir.alloca !fir.class<!fir.heap<!fir.type<_QMgeometryTtriangle{color:i32,isFilled:!fir.logical<4>,base:f32,height:f32}>>>
+%1 = fir.alloca !fir.class<!fir.heap<!fir.type<_QMgeometryTrectangle{color:i32,isFilled:!fir.logical<4>,length:f32,width:f32}>>>
+%3 = fir.convert %0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMgeometryTtriangle{color:i32,isFilled:!fir.logical<4>,base:f32,height:f32}>>>>) -> !fir.ref<!fir.box<none>>
+%4 = fir.gentypedesc !fir.type<_QMgeometryTtriangle{color:i32,isFilled:!fir.logical<4>,base:f32,height:f32}>
+%5 = fir.call @_FortranAAllocatableInitDerived(%3, %4)
+
+%6 = fir.convert %1 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMgeometryTrectangle{color:i32,isFilled:!fir.logical<4>,length:f32,width:f32}>>>>) -> !fir.ref<!fir.box<none>>
+%7 = fir.gentypedesc !fir.type<_QMgeometryTrectangle{color:i32,isFilled:!fir.logical<4>,length:f32,width:f32}>
+%8 = fir.call @_FortranAAllocatableInitDerived(%6, %7)
+```
+
+For pointer allocation, the `PointerAllocate` function is used.
+
+`DEALLOCATE`
+
+The `DEALLOCATE` statement is lowered to a runtime call to
+`AllocatableDeallocate`, or to `PointerDeallocate` for pointers.
+
+**Fortran**
+```fortran
+deallocate(shapes(1)%item)
+deallocate(shapes(2)%item)
+```
+
+**FIR**
+```c
+%8 = fir.call @_FortranAAllocatableDeallocate(%desc1)
+%9 = fir.call @_FortranAAllocatableDeallocate(%desc2)
+```
+
+### `EXTENDS_TYPE_OF`/`SAME_TYPE_AS` intrinsics
+
+The `EXTENDS_TYPE_OF` and `SAME_TYPE_AS` intrinsics have implementations in the
+runtime: respectively `ExtendsTypeOf` and `SameTypeAs` in
+`flang/include/flang/Evaluate/type.h`.
+
+Both intrinsic functions are lowered to their respective runtime calls.
+
+### Assignment / Pointer assignment
+
+Intrinsic assignment of an object to another is already implemented in the
+runtime. The function `@_FortranAAssign` performs the correct operations; its
+interface is available in `flang/include/flang/Runtime/assign.h`.
+
+### User defined assignment and operator
+
+**Fortran**
+```fortran
+module mod1
+type t1
+contains
+  procedure :: assign_t1
+  generic :: assignment(=) => assign_t1
+end type t1
+
+type, extends(t1) :: t2
+end type
+
+contains
+
+subroutine assign_t1(to, from)
+  class(t1), intent(inout) :: to
+  class(t1), intent(in) :: from
+  ! Custom code for the assignment
+end subroutine
+
+subroutine assign_t2(to, from)
+  class(t2), intent(inout) :: to
+  class(t2), intent(in) :: from
+  ! Custom code for the assignment
+end subroutine
+
+end module
+
+program main
+use mod1
+
+class(t1), allocatable :: v1
+class(t1), allocatable :: v2
+
+allocate(t2::v1)
+allocate(t2::v2)
+
+v2 = v1
+
+end program
+```
+
+In the example above, the assignment `v2 = v1` is done with a call to
+`assign_t1`. This is resolved at compile time, since `t2` cannot have a generic
+type-bound procedure for assignment whose interface is not distinguishable from
+the inherited one. The same applies to user-defined operators.
+
+### `NULLIFY`
+
+When a `NULLIFY` statement is applied to a polymorphic pointer (7.3.2.3), its
+dynamic type becomes the same as its declared type.
+
+The `NULLIFY` statement is lowered to a call to the corresponding runtime
+function `PointerNullifyDerived` in `flang/include/flang/Runtime/pointer.h`.
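+
+A minimal Fortran illustration of this dynamic type reset, reusing the geometry
+types from the earlier examples:
+
+**Fortran**
+```fortran
+class(shape), pointer :: sh
+type(triangle), target :: tr
+
+sh => tr     ! the dynamic type of sh is now triangle
+nullify(sh)  ! the dynamic type of sh reverts to its declared type, shape
+```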
+
+### Impact on existing FIR operations dealing with descriptors
+
+Currently, FIR has a couple of operations taking descriptors as inputs or
+producing descriptors as outputs. These operations might need to deal with the
+dynamic type of polymorphic entities.
+
+- `fir.load`/`fir.store`
+  - Currently, a `fir.load` of a `fir.box` is a special case: no copy is made
+    in code generation. This could be problematic with polymorphic entities.
+    When a `fir.load` is performed on a `fir.class` type, the dynamic type can
+    be copied.
+
+  **Fortran**
+  ```fortran
+  module mod1
+    class(shape), pointer :: a
+  contains
+    subroutine sub1()
+      associate (b => a)
+        ! Some more code
+      end associate
+    end subroutine
+  end module
+  ```
+
+  In the example above, the dynamic type of `a` and `b` might be different. The
+  dynamic type of `a` must be copied when it is associated with `b`.
+
+  **FIR**
+  ```c
+  // fir.load must copy the dynamic type from the pointer `a`
+  %0 = fir.address_of(@_QMmod1Ea) : !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMgeometryTshape{color:i32,isFilled:!fir.logical<4>}>>>>
+  %1 = fir.load %0 : !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMgeometryTshape{color:i32,isFilled:!fir.logical<4>}>>>>
+  ```
+
+- `fir.embox`
+  - The embox operation is used to create a descriptor from a reference. With
+    polymorphic entities, it is used to create a polymorphic descriptor from
+    a derived type. The declared type of the descriptor and the derived type
+    are identical. The dynamic type of the descriptor must be set when it is
+    created. This is already handled by lowering.
+
+- `fir.rebox`
+  - The rebox operation is used to create a new descriptor from another
+    descriptor, with optional new dimension information. If the original
+    descriptor describes a polymorphic entity, its dynamic type must be
+    propagated to the new descriptor.
+    ```
+    %0 = fir.slice %c10, %c33, %c2 : (index, index, index) -> !fir.slice<1>
+    %1 = fir.shift %c0 : (index) -> !fir.shift<1>
+    %2 = fir.rebox %x(%1)[%0] : (!fir.class<!fir.array<?x!fir.type<_QMgeometryTshape{color:i32,isFilled:!fir.logical<4>}>>>, !fir.shift<1>, !fir.slice<1>) -> !fir.class<!fir.array<?x!fir.type<_QMgeometryTshape{color:i32,isFilled:!fir.logical<4>}>>>
+    ```
+---
+
+# Testing
+
+- The lowering part is tested with in-tree LIT tests.
+- Polymorphic entities involve a lot of runtime information, so executable
+  tests will be useful for full testing.
+
+---
+
+# Current TODOs
+Current list of TODOs in lowering:
+- `flang/lib/Lower/Allocatable.cpp:465` not yet implemented: SOURCE allocation
+- `flang/lib/Lower/Allocatable.cpp:468` not yet implemented: MOLD allocation
+- `flang/lib/Lower/Allocatable.cpp:471` not yet implemented: polymorphic entity allocation
+- `flang/lib/Lower/Bridge.cpp:448` not yet implemented: create polymorphic host associated copy
+- `flang/lib/Lower/Bridge.cpp:2185` not yet implemented: assignment to polymorphic allocatable
+- `flang/lib/Lower/Bridge.cpp:2288` not yet implemented: pointer assignment involving polymorphic entity
+- `flang/lib/Lower/Bridge.cpp:2316` not yet implemented: pointer assignment involving polymorphic entity
+- `flang/lib/Lower/CallInterface.cpp:795` not yet implemented: support for polymorphic types
+- `flang/lib/Lower/ConvertType.cpp:237` not yet implemented: support for polymorphic types
+
+Current list of TODOs in code generation:
+
+- `flang/lib/Optimizer/CodeGen/CodeGen.cpp:897` not yet implemented: fir.dispatch codegen
+- `flang/lib/Optimizer/CodeGen/CodeGen.cpp:911` not yet implemented: fir.dispatch_table codegen
+- `flang/lib/Optimizer/CodeGen/CodeGen.cpp:924` not yet implemented: fir.dt_entry codegen
+- `flang/lib/Optimizer/CodeGen/CodeGen.cpp:2651` not yet implemented: fir.gentypedesc codegen
+
+---
+
+Resources:
+- [1] https://www.pgroup.com/blogs/posts/f03-oop-part1.htm
+- [2] https://www.pgroup.com/blogs/posts/f03-oop-part2.htm
+- [3] https://www.pgroup.com/blogs/posts/f03-oop-part3.htm
+- [4] https://www.pgroup.com/blogs/posts/f03-oop-part4.htm
+- [5] Modern Fortran explained
diff --git a/flang/include/flang/Common/idioms.h b/flang/include/flang/Common/idioms.h
--- a/flang/include/flang/Common/idioms.h
+++ b/flang/include/flang/Common/idioms.h
@@ -123,6 +123,9 @@
   const std::size_t value;
 };
 
+template <typename T>
+ListItemCount(std::initializer_list<T>) -> ListItemCount;
+
 #define ENUM_CLASS(NAME, ...)
\ enum class NAME { __VA_ARGS__ }; \ LLVM_ATTRIBUTE_UNUSED static constexpr std::size_t NAME##_enumSize{[] { \ diff --git a/flang/lib/Decimal/CMakeLists.txt b/flang/lib/Decimal/CMakeLists.txt --- a/flang/lib/Decimal/CMakeLists.txt +++ b/flang/lib/Decimal/CMakeLists.txt @@ -1,5 +1,5 @@ -add_flang_library(FortranDecimal +add_flang_library(FortranDecimal INSTALL_WITH_TOOLCHAIN binary-to-decimal.cpp decimal-to-binary.cpp ) diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -31,11 +31,14 @@ #include "flang/Optimizer/Support/FIRContext.h" #include "flang/Optimizer/Transforms/Passes.h" #include "mlir/IR/Matchers.h" +#include "mlir/IR/TypeUtilities.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/RegionUtils.h" +#include "llvm/ADT/Optional.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "flang-simplify-intrinsics" @@ -159,8 +162,13 @@ /// with signature provided by \p funcOp. The caller is responsible /// for saving/restoring the original insertion point of \p builder. /// \p funcOp is expected to be empty on entry to this function. +/// \p arg1ElementTy and \p arg2ElementTy specify elements types +/// of the underlying array objects - they are used to generate proper +/// element accesses. static void genFortranADotBody(fir::FirOpBuilder &builder, - mlir::func::FuncOp &funcOp) { + mlir::func::FuncOp &funcOp, + mlir::Type arg1ElementTy, + mlir::Type arg2ElementTy) { // function FortranADotProduct_simplified(arr1, arr2) // T, dimension(:) :: arr1, arr2 // T product = 0 @@ -171,14 +179,15 @@ // FortranADotProduct_simplified = product // end function FortranADotProduct_simplified auto loc = mlir::UnknownLoc::get(builder.getContext()); - mlir::Type elementType = funcOp.getResultTypes()[0]; + mlir::Type resultElementType = funcOp.getResultTypes()[0]; builder.setInsertionPointToEnd(funcOp.addEntryBlock()); mlir::IndexType idxTy = builder.getIndexType(); - mlir::Value zero = elementType.isa() - ? builder.createRealConstant(loc, elementType, 0.0) - : builder.createIntegerConstant(loc, elementType, 0); + mlir::Value zero = + resultElementType.isa() + ? builder.createRealConstant(loc, resultElementType, 0.0) + : builder.createIntegerConstant(loc, resultElementType, 0); mlir::Block::BlockArgListType args = funcOp.front().getArguments(); mlir::Value arg1 = args[0]; @@ -187,10 +196,12 @@ mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0); fir::SequenceType::Shape flatShape = {fir::SequenceType::getUnknownExtent()}; - mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType); - mlir::Type boxArrTy = fir::BoxType::get(arrTy); - mlir::Value array1 = builder.create(loc, boxArrTy, arg1); - mlir::Value array2 = builder.create(loc, boxArrTy, arg2); + mlir::Type arrTy1 = fir::SequenceType::get(flatShape, arg1ElementTy); + mlir::Type boxArrTy1 = fir::BoxType::get(arrTy1); + mlir::Value array1 = builder.create(loc, boxArrTy1, arg1); + mlir::Type arrTy2 = fir::SequenceType::get(flatShape, arg2ElementTy); + mlir::Type boxArrTy2 = fir::BoxType::get(arrTy2); + mlir::Value array2 = builder.create(loc, boxArrTy2, arg2); // This version takes the loop trip count from the first argument. 
// If the first argument's box has unknown (at compilation time) // extent, then it may be better to take the extent from the second @@ -216,19 +227,25 @@ mlir::OpBuilder::InsertPoint loopEndPt = builder.saveInsertionPoint(); builder.setInsertionPointToStart(loop.getBody()); - mlir::Type eleRefTy = builder.getRefType(elementType); + mlir::Type eleRef1Ty = builder.getRefType(arg1ElementTy); mlir::Value index = loop.getInductionVar(); mlir::Value addr1 = - builder.create(loc, eleRefTy, array1, index); + builder.create(loc, eleRef1Ty, array1, index); mlir::Value elem1 = builder.create(loc, addr1); + // Convert to the result type. + elem1 = builder.create(loc, resultElementType, elem1); + + mlir::Type eleRef2Ty = builder.getRefType(arg2ElementTy); mlir::Value addr2 = - builder.create(loc, eleRefTy, array2, index); + builder.create(loc, eleRef2Ty, array2, index); mlir::Value elem2 = builder.create(loc, addr2); + // Convert to the result type. + elem2 = builder.create(loc, resultElementType, elem2); - if (elementType.isa()) + if (resultElementType.isa()) sumVal = builder.create( loc, builder.create(loc, elem1, elem2), sumVal); - else if (elementType.isa()) + else if (resultElementType.isa()) sumVal = builder.create( loc, builder.create(loc, elem1, elem2), sumVal); else @@ -317,6 +334,29 @@ return 0; } +/// Given the call operation's box argument \p val, discover +/// the element type of the underlying array object. +/// \returns the element type or llvm::None if the type cannot +/// be reliably found. +/// We expect that the argument is a result of fir.convert +/// with the destination type of !fir.box. +static llvm::Optional getArgElementType(mlir::Value val) { + mlir::Operation *defOp; + do { + defOp = val.getDefiningOp(); + // Analyze only sequences of convert operations. + if (!mlir::isa(defOp)) + return llvm::None; + val = defOp->getOperand(0); + // The convert operation is expected to convert from one + // box type to another box type. + auto boxType = val.getType().cast(); + auto elementType = fir::unwrapSeqOrBoxedSeqType(boxType); + if (!elementType.isa()) + return elementType; + } while (true); +} + void SimplifyIntrinsicsPass::runOnOperation() { LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n"); mlir::ModuleOp module = getOperation(); @@ -380,11 +420,42 @@ if (!type.isa() && !type.isa()) return; + // Try to find the element types of the boxed arguments. + auto arg1Type = getArgElementType(v1); + auto arg2Type = getArgElementType(v2); + + if (!arg1Type || !arg2Type) + return; + + // Support only floating point and integer arguments + // now (e.g. logical is skipped here). + if (!arg1Type->isa() && + !arg1Type->isa()) + return; + if (!arg2Type->isa() && + !arg2Type->isa()) + return; + auto typeGenerator = [&type](fir::FirOpBuilder &builder) { return genFortranADotType(builder, type); }; + auto bodyGenerator = [&arg1Type, + &arg2Type](fir::FirOpBuilder &builder, + mlir::func::FuncOp &funcOp) { + genFortranADotBody(builder, funcOp, *arg1Type, *arg2Type); + }; + + // Suffix the function name with the element types + // of the arguments. 
+ std::string typedFuncName(funcName); + llvm::raw_string_ostream nameOS(typedFuncName); + nameOS << "_"; + arg1Type->print(nameOS); + nameOS << "_"; + arg2Type->print(nameOS); + mlir::func::FuncOp newFunc = getOrCreateFunction( - builder, funcName, typeGenerator, genFortranADotBody); + builder, typedFuncName, typeGenerator, bodyGenerator); auto newCall = builder.create(loc, newFunc, mlir::ValueRange{v1, v2}); call->replaceAllUsesWith(newCall.getResults()); diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt --- a/flang/runtime/CMakeLists.txt +++ b/flang/runtime/CMakeLists.txt @@ -88,4 +88,6 @@ LINK_LIBS FortranDecimal + + INSTALL_WITH_TOOLCHAIN ) diff --git a/flang/runtime/FortranMain/CMakeLists.txt b/flang/runtime/FortranMain/CMakeLists.txt --- a/flang/runtime/FortranMain/CMakeLists.txt +++ b/flang/runtime/FortranMain/CMakeLists.txt @@ -1,3 +1,3 @@ -add_flang_library(Fortran_main STATIC +add_flang_library(Fortran_main STATIC INSTALL_WITH_TOOLCHAIN Fortran_main.c ) diff --git a/flang/test/Lower/OpenACC/acc-data-operands.f90 b/flang/test/Lower/OpenACC/acc-data-operands.f90 --- a/flang/test/Lower/OpenACC/acc-data-operands.f90 +++ b/flang/test/Lower/OpenACC/acc-data-operands.f90 @@ -113,12 +113,88 @@ end subroutine -subroutine acc_operand_array_section2(a) - real, dimension(100) :: a +! Testing array sections on allocatable array +subroutine acc_operand_array_section_allocatable() + real, allocatable :: a(:) + + allocate(a(100)) + + !$acc data copyin(a(1:50)) copyout(a(51:100)) + !$acc end data + + !CHECK: %[[ARR_HEAP:.*]] = fir.alloca !fir.heap> {uniq_name = "_QMacc_data_operandFacc_operand_array_section_allocatableEa.addr"} + + !CHECK: %[[LOAD_ARR0:.*]] = fir.load %[[ARR_HEAP]] : !fir.ref>> + !CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32 + !CHECK: %[[C1_I64:.*]] = fir.convert %[[C1_I32]] : (i32) -> i64 + !CHECK: %[[LB0:.*]] = fir.convert %[[C1_I64]] : (i64) -> index + !CHECK: %[[C1_STEP:.*]] = arith.constant 1 : i64 + !CHECK: %[[STEP0:.*]] = fir.convert %[[C1_STEP]] : (i64) -> index + !CHECK: %[[C50_I32:.*]] = arith.constant 50 : i32 + !CHECK: %[[C50_I64:.*]] = fir.convert %[[C50_I32]] : (i32) -> i64 + !CHECK: %[[UB0:.*]] = fir.convert %[[C50_I64]] : (i64) -> index + !CHECK: %[[SHAPE_SHIFT0:.*]] = fir.shape_shift %{{.*}}, %{{.*}} : (index, index) -> !fir.shapeshift<1> + !CHECK: %[[SLICE0:.*]] = fir.slice %[[LB0]], %[[UB0]], %[[STEP0]] : (index, index, index) -> !fir.slice<1> + !CHECK: %[[ARR_SECTION0:.*]] = fir.embox %[[LOAD_ARR0]](%[[SHAPE_SHIFT0]]) [%[[SLICE0]]] : (!fir.heap>, !fir.shapeshift<1>, !fir.slice<1>) -> !fir.box> + !CHECK: %[[MEM0:.*]] = fir.alloca !fir.box> + !CHECK: fir.store %[[ARR_SECTION0]] to %[[MEM0]] : !fir.ref>> + + !CHECK: %[[LOAD_ARR1:.*]] = fir.load %[[ARR_HEAP]] : !fir.ref>> + !CHECK: %[[C51_I32:.*]] = arith.constant 51 : i32 + !CHECK: %[[C51_I64:.*]] = fir.convert %[[C51_I32]] : (i32) -> i64 + !CHECK: %[[LB1:.*]] = fir.convert %[[C51_I64]] : (i64) -> index + !CHECK: %[[C1_STEP:.*]] = arith.constant 1 : i64 + !CHECK: %[[STEP1:.*]] = fir.convert %[[C1_STEP]] : (i64) -> index + !CHECK: %[[C100_I32:.*]] = arith.constant 100 : i32 + !CHECK: %[[C100_I64:.*]] = fir.convert %[[C100_I32]] : (i32) -> i64 + !CHECK: %[[UB1:.*]] = fir.convert %[[C100_I64]] : (i64) -> index + !CHECK: %[[SHAPE_SHIFT1:.*]] = fir.shape_shift %{{.*}}, %{{.*}} : (index, index) -> !fir.shapeshift<1> + !CHECK: %[[SLICE1:.*]] = fir.slice %[[LB1]], %[[UB1]], %[[STEP1]] : (index, index, index) -> !fir.slice<1> + !CHECK: %[[ARR_SECTION1:.*]] = fir.embox 
%[[LOAD_ARR1]](%[[SHAPE_SHIFT1]]) [%[[SLICE1]]] : (!fir.heap>, !fir.shapeshift<1>, !fir.slice<1>) -> !fir.box> + !CHECK: %[[MEM1:.*]] = fir.alloca !fir.box> + !CHECK: fir.store %[[ARR_SECTION1]] to %[[MEM1]] : !fir.ref>> + + !CHECK: acc.data copyin(%[[MEM0]] : !fir.ref>>) copyout(%[[MEM1]] : !fir.ref>>) + + deallocate(a) +end subroutine - !$acc data copyin(a) + +! Testing array sections on pointer array +subroutine acc_operand_array_section_pointer() + real, target :: a(100) + real, pointer :: p(:) + + p => a + + !$acc data copyin(p(1:50)) !$acc end data + !CHECK: %[[C100:.*]] = arith.constant 100 : index + !CHECK: %[[ARR:.*]] = fir.alloca !fir.array<100xf32> {bindc_name = "a", fir.target, uniq_name = "_QMacc_data_operandFacc_operand_array_section_pointerEa"} + !CHECK: %[[PTR:.*]] = fir.alloca !fir.box>> {bindc_name = "p", uniq_name = "_QMacc_data_operandFacc_operand_array_section_pointerEp"} + !CHECK: %[[SHAPE0:.*]] = fir.shape %[[C100]] : (index) -> !fir.shape<1> + !CHECK: %[[EMBOX0:.*]] = fir.embox %[[ARR]](%[[SHAPE0]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box>> + !CHECK: fir.store %[[EMBOX0]] to %[[PTR]] : !fir.ref>>> + !CHECK: %[[PTR_LOAD:.*]] = fir.load %[[PTR]] : !fir.ref>>> + !CHECK: %[[C0:.*]] = arith.constant 0 : index + !CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[PTR_LOAD]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) + !CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32 + !CHECK: %[[C1_I64:.*]] = fir.convert %[[C1_I32]] : (i32) -> i64 + !CHECK: %[[LB0:.*]] = fir.convert %[[C1_I64]] : (i64) -> index + !CHECK: %[[C1_STEP:.*]] = arith.constant 1 : i64 + !CHECK: %[[STEP0:.*]] = fir.convert %[[C1_STEP]] : (i64) -> index + !CHECK: %[[C50_I32:.*]] = arith.constant 50 : i32 + !CHECK: %[[C50_I64:.*]] = fir.convert %[[C50_I32]] : (i32) -> i64 + !CHECK: %[[UB0:.*]] = fir.convert %[[C50_I64]] : (i64) -> index + !CHECK: %[[SHIFT0:.*]] = fir.shift %[[BOX_DIMS]]#0 : (index) -> !fir.shift<1> + !CHECK: %[[SLICE0:.*]] = fir.slice %[[LB0]], %[[UB0]], %[[STEP0]] : (index, index, index) -> !fir.slice<1> + !CHECK: %[[REBOX0:.*]] = fir.rebox %7(%[[SHIFT0]]) [%[[SLICE0]]] : (!fir.box>>, !fir.shift<1>, !fir.slice<1>) -> !fir.box> + !CHECK: %[[MEM0:.*]] = fir.alloca !fir.box> + !CHECK: fir.store %[[REBOX0]] to %[[MEM0]] : !fir.ref>> + + !CHECK: acc.data copyin(%[[MEM0]] : !fir.ref>>) { + end subroutine end module diff --git a/flang/test/Transforms/simplifyintrinsics.fir b/flang/test/Transforms/simplifyintrinsics.fir --- a/flang/test/Transforms/simplifyintrinsics.fir +++ b/flang/test/Transforms/simplifyintrinsics.fir @@ -344,15 +344,15 @@ // CHECK: %[[RESLOC:.*]] = fir.alloca f32 {bindc_name = "dot", uniq_name = "_QFdotEdot"} // CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box>) -> !fir.box // CHECK: %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box>) -> !fir.box -// CHECK: %[[RES:.*]] = fir.call @_FortranADotProductReal4_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box, !fir.box) -> f32 +// CHECK: %[[RES:.*]] = fir.call @_FortranADotProductReal4_f32_f32_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box, !fir.box) -> f32 // CHECK: fir.store %[[RES]] to %[[RESLOC]] : !fir.ref // CHECK: %[[RET:.*]] = fir.load %[[RESLOC]] : !fir.ref // CHECK: return %[[RET]] : f32 // CHECK: } -// CHECK-LABEL: func.func private @_FortranADotProductReal4_simplified( -// CHECK-SAME: %[[A:.*]]: !fir.box, -// CHECK-SAME: %[[B:.*]]: !fir.box) -> f32 attributes {llvm.linkage = #llvm.linkage} { +// CHECK-LABEL: func.func private @_FortranADotProductReal4_f32_f32_simplified( +// CHECK-SAME: %[[A:.*]]: !fir.box, +// CHECK-SAME: 
%[[B:.*]]: !fir.box) -> f32 attributes {llvm.linkage = #llvm.linkage} { // CHECK: %[[FZERO:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[IZERO:.*]] = arith.constant 0 : index // CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box) -> !fir.box> @@ -363,9 +363,11 @@ // CHECK: %[[RES:.*]] = fir.do_loop %[[IDX:.*]] = %[[IZERO]] to %[[LEN]] step %[[IONE]] iter_args(%[[SUM:.*]] = %[[FZERO]]) -> (f32) { // CHECK: %[[ALOC:.*]] = fir.coordinate_of %[[ACAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[AVAL:.*]] = fir.load %[[ALOC]] : !fir.ref +// CHECK: %[[AVALCAST:.*]] = fir.convert %[[AVAL]] : (f32) -> f32 // CHECK: %[[BLOC:.*]] = fir.coordinate_of %[[BCAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[BVAL:.*]] = fir.load %[[BLOC]] : !fir.ref -// CHECK: %[[MUL:.*]] = arith.mulf %[[AVAL]], %[[BVAL]] : f32 +// CHECK: %[[BVALCAST:.*]] = fir.convert %[[BVAL]] : (f32) -> f32 +// CHECK: %[[MUL:.*]] = arith.mulf %[[AVALCAST]], %[[BVALCAST]] : f32 // CHECK: %[[NEWSUM:.*]] = arith.addf %[[MUL]], %[[SUM]] : f32 // CHECK: fir.result %[[NEWSUM]] : f32 // CHECK: } @@ -479,15 +481,15 @@ // CHECK: %[[RESLOC:.*]] = fir.alloca i32 {bindc_name = "dot", uniq_name = "_QFdotEdot"} // CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box>) -> !fir.box // CHECK: %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box>) -> !fir.box -// CHECK: %[[RES:.*]] = fir.call @_FortranADotProductInteger4_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box, !fir.box) -> i32 +// CHECK: %[[RES:.*]] = fir.call @_FortranADotProductInteger4_i32_i32_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box, !fir.box) -> i32 // CHECK: fir.store %[[RES]] to %[[RESLOC]] : !fir.ref // CHECK: %[[RET:.*]] = fir.load %[[RESLOC]] : !fir.ref // CHECK: return %[[RET]] : i32 // CHECK: } -// CHECK-LABEL: func.func private @_FortranADotProductInteger4_simplified( -// CHECK-SAME: %[[A:.*]]: !fir.box, -// CHECK-SAME: %[[B:.*]]: !fir.box) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// CHECK-LABEL: func.func private @_FortranADotProductInteger4_i32_i32_simplified( +// CHECK-SAME: %[[A:.*]]: !fir.box, +// CHECK-SAME: %[[B:.*]]: !fir.box) -> i32 attributes {llvm.linkage = #llvm.linkage} { // CHECK: %[[I32ZERO:.*]] = arith.constant 0 : i32 // CHECK: %[[IZERO:.*]] = arith.constant 0 : index // CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box) -> !fir.box> @@ -498,9 +500,11 @@ // CHECK: %[[RES:.*]] = fir.do_loop %[[IDX:.*]] = %[[IZERO]] to %[[LEN]] step %[[IONE]] iter_args(%[[SUM:.*]] = %[[I32ZERO]]) -> (i32) { // CHECK: %[[ALOC:.*]] = fir.coordinate_of %[[ACAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[AVAL:.*]] = fir.load %[[ALOC]] : !fir.ref +// CHECK: %[[AVALCAST:.*]] = fir.convert %[[AVAL]] : (i32) -> i32 // CHECK: %[[BLOC:.*]] = fir.coordinate_of %[[BCAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[BVAL:.*]] = fir.load %[[BLOC]] : !fir.ref -// CHECK: %[[MUL:.*]] = arith.muli %[[AVAL]], %[[BVAL]] : i32 +// CHECK: %[[BVALCAST:.*]] = fir.convert %[[BVAL]] : (i32) -> i32 +// CHECK: %[[MUL:.*]] = arith.muli %[[AVALCAST]], %[[BVALCAST]] : i32 // CHECK: %[[NEWSUM:.*]] = arith.addi %[[MUL]], %[[SUM]] : i32 // CHECK: fir.result %[[NEWSUM]] : i32 // CHECK: } @@ -587,3 +591,63 @@ // CHECK-SAME: %[[A:.*]]: !fir.box> {fir.bindc_name = "a"}, // CHECK-SAME: %[[B:.*]]: !fir.box> {fir.bindc_name = "b"}) -> i64 { // CHECK-NOT: call{{.*}}_FortranADotProductInteger8( + +// ----- + +// Test mixed types, e.g. when _FortranADotProductReal8 is called +// with and arguments. 
The loaded elements must be converted +// to the result type REAL(8) before the computations. + +func.func @dot_f64_f32(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: !fir.box> {fir.bindc_name = "b"}) -> f64 { + %0 = fir.alloca f64 {bindc_name = "dot", uniq_name = "_QFdotEdot"} + %1 = fir.address_of(@_QQcl.2E2F646F742E66393000) : !fir.ref> + %c3_i32 = arith.constant 3 : i32 + %2 = fir.convert %arg0 : (!fir.box>) -> !fir.box + %3 = fir.convert %arg1 : (!fir.box>) -> !fir.box + %4 = fir.convert %1 : (!fir.ref>) -> !fir.ref + %5 = fir.call @_FortranADotProductReal8(%2, %3, %4, %c3_i32) : (!fir.box, !fir.box, !fir.ref, i32) -> f64 + fir.store %5 to %0 : !fir.ref + %6 = fir.load %0 : !fir.ref + return %6 : f64 +} +func.func private @_FortranADotProductReal4(!fir.box, !fir.box, !fir.ref, i32) -> f32 attributes {fir.runtime} +fir.global linkonce @_QQcl.2E2F646F742E66393000 constant : !fir.char<1,10> { + %0 = fir.string_lit "./dot.f90\00"(10) : !fir.char<1,10> + fir.has_value %0 : !fir.char<1,10> +} + +// CHECK-LABEL: func.func @dot_f64_f32( +// CHECK-SAME: %[[A:.*]]: !fir.box> {fir.bindc_name = "a"}, +// CHECK-SAME: %[[B:.*]]: !fir.box> {fir.bindc_name = "b"}) -> f64 { +// CHECK: %[[RESLOC:.*]] = fir.alloca f64 {bindc_name = "dot", uniq_name = "_QFdotEdot"} +// CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box>) -> !fir.box +// CHECK: %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box>) -> !fir.box +// CHECK: %[[RES:.*]] = fir.call @_FortranADotProductReal8_f64_f32_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box, !fir.box) -> f64 +// CHECK: fir.store %[[RES]] to %[[RESLOC]] : !fir.ref +// CHECK: %[[RET:.*]] = fir.load %[[RESLOC]] : !fir.ref +// CHECK: return %[[RET]] : f64 +// CHECK: } + +// CHECK-LABEL: func.func private @_FortranADotProductReal8_f64_f32_simplified( +// CHECK-SAME: %[[A:.*]]: !fir.box, +// CHECK-SAME: %[[B:.*]]: !fir.box) -> f64 attributes {llvm.linkage = #llvm.linkage} { +// CHECK: %[[FZERO:.*]] = arith.constant 0.000000e+00 : f64 +// CHECK: %[[IZERO:.*]] = arith.constant 0 : index +// CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box) -> !fir.box> +// CHECK: %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box) -> !fir.box> +// CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[ACAST]], %[[IZERO]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[IONE:.*]] = arith.constant 1 : index +// CHECK: %[[LEN:.*]] = arith.subi %[[DIMS]]#1, %[[IONE]] : index +// CHECK: %[[RES:.*]] = fir.do_loop %[[IDX:.*]] = %[[IZERO]] to %[[LEN]] step %[[IONE]] iter_args(%[[SUM:.*]] = %[[FZERO]]) -> (f64) { +// CHECK: %[[ALOC:.*]] = fir.coordinate_of %[[ACAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref +// CHECK: %[[AVAL:.*]] = fir.load %[[ALOC]] : !fir.ref +// CHECK: %[[AVALCAST:.*]] = fir.convert %[[AVAL]] : (f64) -> f64 +// CHECK: %[[BLOC:.*]] = fir.coordinate_of %[[BCAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref +// CHECK: %[[BVAL:.*]] = fir.load %[[BLOC]] : !fir.ref +// CHECK: %[[BVALCAST:.*]] = fir.convert %[[BVAL]] : (f32) -> f64 +// CHECK: %[[MUL:.*]] = arith.mulf %[[AVALCAST]], %[[BVALCAST]] : f64 +// CHECK: %[[NEWSUM:.*]] = arith.addf %[[MUL]], %[[SUM]] : f64 +// CHECK: fir.result %[[NEWSUM]] : f64 +// CHECK: } +// CHECK: return %[[RES]] : f64 +// CHECK: } diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -91,6 +91,10 @@ libc.src.stdlib.realloc libc.src.stdlib.free + # stdio.h entrypoints + libc.src.stdio.sprintf + libc.src.stdio.snprintf + # 
sys/stat.h entrypoints libc.src.sys.stat.mkdir libc.src.sys.stat.mkdirat @@ -242,8 +246,6 @@ libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.fwrite_unlocked - libc.src.stdio.sprintf - libc.src.stdio.snprintf libc.src.stdio.fprintf libc.src.stdio.printf libc.src.stdio.stderr diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -92,6 +92,10 @@ libc.src.stdlib.aligned_alloc libc.src.stdlib.free + # stdio.h entrypoints + libc.src.stdio.sprintf + libc.src.stdio.snprintf + # sys/mman.h entrypoints libc.src.sys.mman.mmap libc.src.sys.mman.munmap @@ -298,8 +302,6 @@ libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.fwrite_unlocked - libc.src.stdio.sprintf - libc.src.stdio.snprintf libc.src.stdio.fprintf libc.src.stdio.printf libc.src.stdio.stderr diff --git a/libc/src/CMakeLists.txt b/libc/src/CMakeLists.txt --- a/libc/src/CMakeLists.txt +++ b/libc/src/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory(math) add_subdirectory(string) add_subdirectory(stdlib) +add_subdirectory(stdio) if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(dirent) @@ -24,6 +25,5 @@ # since assert uses the signal API, we disable assert also. # add_subdirectory(assert) # add_subdirectory(signal) -add_subdirectory(stdio) add_subdirectory(threads) add_subdirectory(time) diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -31,17 +31,6 @@ .core_structs ) -add_object_library( - file_writer - SRCS - file_writer.cpp - HDRS - file_writer.h - DEPENDS - libc.src.__support.File.file - .core_structs -) - add_object_library( writer SRCS @@ -91,6 +80,23 @@ libc.src.__support.arg_list ) +if(NOT (TARGET libc.src.__support.File.file)) + # Not all platforms have a file implementation. If file is unvailable, + # then we must skip all file based printf sections. + return() +endif() + +add_object_library( + file_writer + SRCS + file_writer.cpp + HDRS + file_writer.h + DEPENDS + libc.src.__support.File.file + .core_structs +) + add_object_library( vfprintf_internal SRCS diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -34,6 +34,7 @@ add_subdirectory(math) add_subdirectory(string) add_subdirectory(stdlib) +add_subdirectory(stdio) if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(fcntl) @@ -50,7 +51,6 @@ # since assert uses the signal API, we disable assert also. 
# add_subdirectory(assert) # add_subdirectory(signal) -add_subdirectory(stdio) add_subdirectory(time) if(${LIBC_TARGET_OS} STREQUAL "linux") diff --git a/libc/test/src/stdio/printf_core/parser_test.cpp b/libc/test/src/stdio/printf_core/parser_test.cpp --- a/libc/test/src/stdio/printf_core/parser_test.cpp +++ b/libc/test/src/stdio/printf_core/parser_test.cpp @@ -191,7 +191,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithLongLengthModifier) { __llvm_libc::printf_core::FormatSection format_arr[10]; const char *str = "%lld"; - int arg1 = 12345; + long long arg1 = 12345; evaluate(format_arr, str, arg1); __llvm_libc::printf_core::FormatSection expected; @@ -208,7 +208,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithAllOptions) { __llvm_libc::printf_core::FormatSection format_arr[10]; const char *str = "% -056.78jd"; - int arg1 = 12345; + intmax_t arg1 = 12345; evaluate(format_arr, str, arg1); __llvm_libc::printf_core::FormatSection expected; diff --git a/libc/utils/UnitTest/CMakeLists.txt b/libc/utils/UnitTest/CMakeLists.txt --- a/libc/utils/UnitTest/CMakeLists.txt +++ b/libc/utils/UnitTest/CMakeLists.txt @@ -54,13 +54,6 @@ libc.src.__support.CPP.array_ref ) -if(NOT LLVM_LIBC_FULL_BUILD) # TODO(michaelrj): make a more permanant solution. - return() -endif() - -#currently stdio is fullbuild only, so this matcher that depends on a piece of -#printf also has to be fullbuild only. - add_library( LibcPrintfHelpers PrintfMatcher.h diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -592,6 +592,7 @@ -Wno-user-defined-literals -Wno-covered-switch-default -Wno-suggest-override + -Wno-ctad-maybe-unsupported ) if (LIBCXX_TARGETING_CLANG_CL) target_add_compile_flags_if_supported(${target} PRIVATE diff --git a/libcxx/docs/Status/SpaceshipProjects.csv b/libcxx/docs/Status/SpaceshipProjects.csv --- a/libcxx/docs/Status/SpaceshipProjects.csv +++ b/libcxx/docs/Status/SpaceshipProjects.csv @@ -15,9 +15,9 @@ | `[type.info] `_,| `typeinfo `_,None,Adrian Vogelsgesang,|Complete| | `[coroutine.handle.compare] `_,| `coroutine_handle `_,[comparisons.three.way],Chuanqi Xu,|Complete| | `[pairs.spec] `_,| `pair `_,[expos.only.func],Kent Ross,|Complete| -| `[syserr.errcat.nonvirtuals] `_,| `error_category `_,[comparisons.three.way],Adrian Vogelsgesang,|In Progress| +| `[syserr.errcat.nonvirtuals] `_,| `error_category `_,[comparisons.three.way],Adrian Vogelsgesang,|Complete| | `[syserr.compare] `_,"| `error_code `_ -| `error_condition `_",None,Adrian Vogelsgesang,|In Progress| +| `error_condition `_",None,Adrian Vogelsgesang,|Complete| | `[tuple.rel] `_,| `tuple `_,[expos.only.func],Kent Ross,|Complete| "| `[optional.relops] `_ | `[optional.nullops] `_ @@ -29,31 +29,31 @@ | `[unique.ptr.special] `_,| `unique_ptr `_,[comparisons.three.way],Adrian Vogelsgesang,|Complete| | `[util.smartptr.shared.cmp] `_,| `shared_ptr `_,[comparisons.three.way],Adrian Vogelsgesang,|Complete| | `[type.index.members] `_,| `type_index `_,None,Adrian Vogelsgesang,|Complete| -| `[charconv.syn] `_,| to_chars_result,None,Mark de Wever,|Complete| -| `[charconv.syn] `_,| from_chars_result,None,Mark de Wever,|Complete| +| `[charconv.syn] `_,| `to_chars_result `_,None,Mark de Wever,|Complete| +| `[charconv.syn] `_,| `from_chars_result `_,None,Mark de Wever,|Complete| | `[stacktrace.entry.cmp] `_,| stacktrace_entry,None,Unassigned,|Not Started| | `[stacktrace.basic.cmp] `_,| basic_stacktrace,[alg.three.way],Unassigned,|Not Started| -| `[string.cmp] `_,| `basic_string `,None,Mark de 
Wever,|Complete| +| `[string.cmp] `_,| `basic_string `_,None,Mark de Wever,|Complete| | `[string.view.comparison] `_,| `basic_string_view `_,None,Mark de Wever,|Complete| -| `[array.syn] `_ (`general `_),| array,[expos.only.func],Unassigned,|Not Started| -| `[deque.syn] `_ (`general `_),| deque,[expos.only.func],Unassigned,|Not Started| -| `[forward.list.syn] `_ (`general `_),| forward_list,[expos.only.func],Unassigned,|Not Started| -| `[list.syn] `_ (`general `_),| list,[expos.only.func],Unassigned,|Not Started| -| `[vector.syn] `_ (`general `_),| vector,[expos.only.func],Unassigned,|Not Started| -| `[associative.map.syn] `_ (`general `_),"| map +| `[array.syn] `_ (`general `_),| array,[expos.only.func],Unassigned,|Not Started| +| `[deque.syn] `_ (`general `_),| deque,[expos.only.func],Unassigned,|Not Started| +| `[forward.list.syn] `_ (`general `_),| forward_list,[expos.only.func],Unassigned,|Not Started| +| `[list.syn] `_ (`general `_),| list,[expos.only.func],Unassigned,|Not Started| +| `[vector.syn] `_ (`general `_),| vector,[expos.only.func],Unassigned,|Not Started| +| `[associative.map.syn] `_ (`general `_),"| map | multimap",[expos.only.func],Unassigned,|Not Started| -| `[associative.set.syn] `_ (`general `_),"| multiset +| `[associative.set.syn] `_ (`general `_),"| multiset | set",[expos.only.func],Unassigned,|Not Started| | `[queue.ops] `_,| queue,None,Unassigned,|Not Started| | `[stack.ops] `_,| stack,None,Unassigned,|Not Started| -| `[reverse.iter.cmp] `_,| reverse_iterator,None,Mikhail Maltsev,|Complete| +| `[reverse.iter.cmp] `_,| `reverse_iterator `_,None,Mikhail Maltsev,|Complete| | `[move.iter.op.comp] `_,| move_iterator,None,Unassigned,|Not Started| | `[counted.iter.cmp] `_,| counted_iterator,None,Unassigned,|Not Started| | `[range.iota.iterator] `_,| `ranges::iota_view::iterator `_,[concepts.cmp],Arthur O'Dwyer,|Complete| | `[range.transform.iterator] `_,| `ranges::transform_view::iterator `_,[concepts.cmp],Arthur O'Dwyer,|Complete| | `[range.elements.iterator] `_,| ranges::elements_view::iterator,[concepts.cmp],Unassigned,|Not Started| | `[time.duration.comparisons] `_, "chrono::duration", None, Mark de Wever, |Not Started| -| `[time.point.comparisons] `_, "chrono::point", None, Mark de Wever, |Not Started| +| `[time.point.comparisons] `_, "chrono::time_point", None, Mark de Wever, |Not Started| "| `[time.cal.day.nonmembers] `_ | `[time.cal.month.nonmembers] `_ | `[time.cal.year.nonmembers] `_ @@ -61,14 +61,14 @@ | `[time.cal.mdlast] `_ | `[time.cal.ym.nonmembers] `_ | `[time.cal.ymd.nonmembers] `_ -| `[time.cal.ymdlast.nonmembers] `_","| chrono::day -| chrono::month -| chrono::year -| chrono::month_day -| chrono::month_day_last -| chrono::year_month -| chrono::year_month_day -| chrono::year_month_day_last",None,Mark de Wever,|Complete| +| `[time.cal.ymdlast.nonmembers] `_","| `chrono::day `_ +| `chrono::month `_ +| `chrono::year `_ +| `chrono::month_day `_ +| `chrono::month_day_last `_ +| `chrono::year_month `_ +| `chrono::year_month_day `_ +| `chrono::year_month_day_last `_",None,Mark de Wever,|Complete| "| `[time.zone.nonmembers] `_ | `[time.zone.leap.nonmembers] `_ | `[time.zone.link.nonmembers] `_","| chrono::time_zone diff --git a/libcxx/include/system_error b/libcxx/include/system_error --- a/libcxx/include/system_error +++ b/libcxx/include/system_error @@ -32,8 +32,9 @@ virtual string message(int ev) const = 0; bool operator==(const error_category& rhs) const noexcept; - bool operator!=(const error_category& rhs) const noexcept; - bool operator<(const 
error_category& rhs) const noexcept; + bool operator!=(const error_category& rhs) const noexcept; // removed in C++20 + bool operator<(const error_category& rhs) const noexcept; // removed in C++20 + strong_ordering operator<=>(const error_category& rhs) const noexcept; // C++20 }; const error_category& generic_category() noexcept; @@ -75,7 +76,6 @@ }; // non-member functions: -bool operator<(const error_code& lhs, const error_code& rhs) noexcept; template basic_ostream& operator<<(basic_ostream& os, const error_code& ec); @@ -102,8 +102,6 @@ explicit operator bool() const noexcept; }; -bool operator<(const error_condition& lhs, const error_condition& rhs) noexcept; - class system_error : public runtime_error { @@ -128,12 +126,16 @@ // Comparison operators: bool operator==(const error_code& lhs, const error_code& rhs) noexcept; bool operator==(const error_code& lhs, const error_condition& rhs) noexcept; -bool operator==(const error_condition& lhs, const error_code& rhs) noexcept; +bool operator==(const error_condition& lhs, const error_code& rhs) noexcept; // removed in C++20 bool operator==(const error_condition& lhs, const error_condition& rhs) noexcept; -bool operator!=(const error_code& lhs, const error_code& rhs) noexcept; -bool operator!=(const error_code& lhs, const error_condition& rhs) noexcept; -bool operator!=(const error_condition& lhs, const error_code& rhs) noexcept; -bool operator!=(const error_condition& lhs, const error_condition& rhs) noexcept; +bool operator!=(const error_code& lhs, const error_code& rhs) noexcept; // removed in C++20 +bool operator!=(const error_code& lhs, const error_condition& rhs) noexcept; // removed in C++20 +bool operator!=(const error_condition& lhs, const error_code& rhs) noexcept; // removed in C++20 +bool operator!=(const error_condition& lhs, const error_condition& rhs) noexcept; // removed in C++20 +bool operator<(const error_condition& lhs, const error_condition& rhs) noexcept; // removed in C++20 +bool operator<(const error_code& lhs, const error_code& rhs) noexcept; // removed in C++20 +strong_ordering operator<=>(const error_code& lhs, const error_code& rhs) noexcept; // C++20 +strong_ordering operator<=>(const error_condition& lhs, const error_condition& rhs) noexcept; // C++20 template <> struct hash; template <> struct hash; @@ -147,6 +149,7 @@ #include <__errc> #include <__functional/hash.h> #include <__functional/unary_function.h> +#include <__memory/addressof.h> #include #include #include @@ -223,12 +226,21 @@ _LIBCPP_INLINE_VISIBILITY bool operator==(const error_category& __rhs) const _NOEXCEPT {return this == &__rhs;} +#if _LIBCPP_STD_VER > 17 + + _LIBCPP_HIDE_FROM_ABI + strong_ordering operator<=>(const error_category& __rhs) const noexcept {return compare_three_way()(this, std::addressof(__rhs));} + +#else // _LIBCPP_STD_VER > 17 + _LIBCPP_INLINE_VISIBILITY bool operator!=(const error_category& __rhs) const _NOEXCEPT {return !(*this == __rhs);} _LIBCPP_INLINE_VISIBILITY bool operator< (const error_category& __rhs) const _NOEXCEPT {return this < &__rhs;} +#endif // _LIBCPP_STD_VER > 17 + friend class _LIBCPP_HIDDEN __do_message; }; @@ -303,14 +315,6 @@ return error_condition(static_cast(__e), generic_category()); } -inline _LIBCPP_INLINE_VISIBILITY -bool -operator<(const error_condition& __x, const error_condition& __y) _NOEXCEPT -{ - return __x.category() < __y.category() - || (__x.category() == __y.category() && __x.value() < __y.value()); -} - // error_code class _LIBCPP_TYPE_VIS error_code @@ -379,14 +383,6 @@ return 
error_code(static_cast(__e), generic_category()); } -inline _LIBCPP_INLINE_VISIBILITY -bool -operator<(const error_code& __x, const error_code& __y) _NOEXCEPT -{ - return __x.category() < __y.category() - || (__x.category() == __y.category() && __x.value() < __y.value()); -} - inline _LIBCPP_INLINE_VISIBILITY bool operator==(const error_code& __x, const error_code& __y) _NOEXCEPT @@ -402,12 +398,14 @@ || __y.category().equivalent(__x, __y.value()); } +#if _LIBCPP_STD_VER <= 17 inline _LIBCPP_INLINE_VISIBILITY bool operator==(const error_condition& __x, const error_code& __y) _NOEXCEPT { return __y == __x; } +#endif inline _LIBCPP_INLINE_VISIBILITY bool @@ -416,6 +414,8 @@ return __x.category() == __y.category() && __x.value() == __y.value(); } +#if _LIBCPP_STD_VER <= 17 + inline _LIBCPP_INLINE_VISIBILITY bool operator!=(const error_code& __x, const error_code& __y) _NOEXCEPT @@ -436,6 +436,42 @@ operator!=(const error_condition& __x, const error_condition& __y) _NOEXCEPT {return !(__x == __y);} +inline _LIBCPP_INLINE_VISIBILITY +bool +operator<(const error_condition& __x, const error_condition& __y) _NOEXCEPT +{ + return __x.category() < __y.category() + || (__x.category() == __y.category() && __x.value() < __y.value()); +} + +inline _LIBCPP_INLINE_VISIBILITY +bool +operator<(const error_code& __x, const error_code& __y) _NOEXCEPT +{ + return __x.category() < __y.category() + || (__x.category() == __y.category() && __x.value() < __y.value()); +} + +#else // _LIBCPP_STD_VER <= 17 + +inline _LIBCPP_HIDE_FROM_ABI strong_ordering +operator<=>(const error_code& __x, const error_code& __y) noexcept +{ + if (auto __c = __x.category() <=> __y.category(); __c != 0) + return __c; + return __x.value() <=> __y.value(); +} + +inline _LIBCPP_HIDE_FROM_ABI strong_ordering +operator<=>(const error_condition& __x, const error_condition& __y) noexcept +{ + if (auto __c = __x.category() <=> __y.category(); __c != 0) + return __c; + return __x.value() <=> __y.value(); +} + +#endif // _LIBCPP_STD_VER <= 17 + template <> struct _LIBCPP_TEMPLATE_VIS hash : public __unary_function diff --git a/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_code.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_code.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_code.pass.cpp @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// class error_code + +// strong_ordering operator<=>(const error_code& lhs, const error_code& rhs) noexcept + +#include +#include + +#include "test_macros.h" +#include "test_comparisons.h" + +int main(int, char**) { + AssertOrderAreNoexcept(); + AssertOrderReturn(); + + // Same error category + std::error_code ec1a = std::error_code(1, std::generic_category()); + std::error_code ec1b = std::error_code(1, std::generic_category()); + std::error_code ec2 = std::error_code(2, std::generic_category()); + + assert(testOrder(ec1a, ec1b, std::strong_ordering::equal)); + assert(testOrder(ec1a, ec2, std::strong_ordering::less)); + + // Different error category + const std::error_code& ec3 = std::error_code(2, std::system_category()); + + bool isLess = ec2 < ec3; + assert(testOrder(ec2, ec3, isLess ? std::strong_ordering::less : std::strong_ordering::greater)); + + return 0; +} diff --git a/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_condition.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_condition.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_condition.pass.cpp @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// class error_condition + +// strong_ordering operator<=>(const error_condition& lhs, const error_condition& rhs) noexcept + +#include +#include + +#include "test_macros.h" +#include "test_comparisons.h" + +int main(int, char**) { + AssertOrderAreNoexcept(); + AssertOrderReturn(); + + // Same error category + std::error_condition ec1a = std::error_condition(1, std::generic_category()); + std::error_condition ec1b = std::error_condition(1, std::generic_category()); + std::error_condition ec2 = std::error_condition(2, std::generic_category()); + + assert(testOrder(ec1a, ec1b, std::strong_ordering::equal)); + assert(testOrder(ec1a, ec2, std::strong_ordering::less)); + + // Different error category + const std::error_condition& ec3 = std::error_condition(2, std::system_category()); + + bool isLess = ec2 < ec3; + assert(testOrder(ec2, ec3, isLess ? std::strong_ordering::less : std::strong_ordering::greater)); + + return 0; +} diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.nonvirtuals/cmp.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.nonvirtuals/cmp.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.nonvirtuals/cmp.pass.cpp @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// class error_category + +// strong_ordering operator<=>(const error_category& rhs) const noexcept; + +#include +#include + +#include "test_macros.h" +#include "test_comparisons.h" + +int main(int, char**) { + AssertOrderAreNoexcept(); + AssertOrderReturn(); + + const std::error_category& e_cat1 = std::generic_category(); + const std::error_category& e_cat2 = std::generic_category(); + const std::error_category& e_cat3 = std::system_category(); + + assert(testOrder(e_cat1, e_cat2, std::strong_ordering::equal)); + + bool isLess = e_cat1 < e_cat3; + assert(testOrder(e_cat1, e_cat3, isLess ? std::strong_ordering::less : std::strong_ordering::greater)); + + return 0; +} diff --git a/libcxx/test/support/MoveOnly.h b/libcxx/test/support/MoveOnly.h --- a/libcxx/test/support/MoveOnly.h +++ b/libcxx/test/support/MoveOnly.h @@ -62,7 +62,7 @@ { typedef MoveOnly argument_type; typedef size_t result_type; - TEST_CONSTEXPR size_t operator()(const MoveOnly& x) const {return x.get();} + TEST_CONSTEXPR size_t operator()(const MoveOnly& x) const {return static_cast(x.get());} }; #endif // MOVEONLY_H diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -190,8 +190,8 @@ #define TEST_HAS_NO_EXCEPTIONS #endif -#if TEST_HAS_FEATURE(address_sanitizer) || TEST_HAS_FEATURE(memory_sanitizer) || \ - TEST_HAS_FEATURE(thread_sanitizer) +#if TEST_HAS_FEATURE(address_sanitizer) || TEST_HAS_FEATURE(hwaddress_sanitizer) || \ + TEST_HAS_FEATURE(memory_sanitizer) || TEST_HAS_FEATURE(thread_sanitizer) #define TEST_HAS_SANITIZERS #endif diff --git a/lld/test/ELF/edata-etext.s b/lld/test/ELF/edata-etext.s --- a/lld/test/ELF/edata-etext.s +++ b/lld/test/ELF/edata-etext.s @@ -37,7 +37,7 @@ ## If a relocatable object file defines non-reserved identifiers (by C and C++) ## edata/end/etext, don't redefine them. Note: GNU ld redefines the reserved -## _edata while we don't for simplicty. +## _edata while we don't for simplicity. # RUN: ld.lld %t/b.o -o %t/b # RUN: llvm-objdump -t %t/b | FileCheck %s --check-prefix=CHECK2 # RUN: ld.lld %t/c.o -o %t/c diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -844,8 +844,7 @@ {std::make_pair(&info.Languages, &languages), std::make_pair(&info.Tools, &tools), std::make_pair(&info.SDKs, &sDKs)}) for (auto &producer : *producers.first) - if (producers.second->end() == - llvm::find_if(*producers.second, + if (llvm::none_of(*producers.second, [&](std::pair seen) { return seen.first == producer.first; })) diff --git a/lldb/examples/customization/bin-utils/binutils.py b/lldb/examples/customization/bin-utils/binutils.py --- a/lldb/examples/customization/bin-utils/binutils.py +++ b/lldb/examples/customization/bin-utils/binutils.py @@ -1,7 +1,5 @@ "Collection of tools for displaying bit representation of numbers.""" -from __future__ import print_function - def binary(n, width=None): """ Return a list of (0|1)'s for the binary representation of n where n >= 0. 
diff --git a/lldb/examples/customization/import-python/importcmd.py b/lldb/examples/customization/import-python/importcmd.py --- a/lldb/examples/customization/import-python/importcmd.py +++ b/lldb/examples/customization/import-python/importcmd.py @@ -1,4 +1,3 @@ -from __future__ import print_function import sys import os import lldb diff --git a/lldb/examples/customization/pwd-cd-and-system/utils.py b/lldb/examples/customization/pwd-cd-and-system/utils.py --- a/lldb/examples/customization/pwd-cd-and-system/utils.py +++ b/lldb/examples/customization/pwd-cd-and-system/utils.py @@ -1,5 +1,4 @@ """Utility for changing directories and execution of commands in a subshell.""" -from __future__ import print_function import os import shlex diff --git a/lldb/examples/darwin/heap_find/heap.py b/lldb/examples/darwin/heap_find/heap.py --- a/lldb/examples/darwin/heap_find/heap.py +++ b/lldb/examples/darwin/heap_find/heap.py @@ -8,7 +8,6 @@ # (lldb) script import lldb.macosx.heap #---------------------------------------------------------------------- -from __future__ import print_function import lldb import optparse import os diff --git a/lldb/examples/python/bsd.py b/lldb/examples/python/bsd.py --- a/lldb/examples/python/bsd.py +++ b/lldb/examples/python/bsd.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import cmd import optparse diff --git a/lldb/examples/python/cmdtemplate.py b/lldb/examples/python/cmdtemplate.py --- a/lldb/examples/python/cmdtemplate.py +++ b/lldb/examples/python/cmdtemplate.py @@ -9,8 +9,6 @@ # (lldb) command script import /path/to/cmdtemplate.py # --------------------------------------------------------------------- -from __future__ import print_function - import inspect import lldb import optparse diff --git a/lldb/examples/python/delta.py b/lldb/examples/python/delta.py --- a/lldb/examples/python/delta.py +++ b/lldb/examples/python/delta.py @@ -16,8 +16,6 @@ # available. #---------------------------------------------------------------------- -from __future__ import print_function - import optparse import os import shlex diff --git a/lldb/examples/python/diagnose_nsstring.py b/lldb/examples/python/diagnose_nsstring.py --- a/lldb/examples/python/diagnose_nsstring.py +++ b/lldb/examples/python/diagnose_nsstring.py @@ -4,8 +4,6 @@ # decisions it did and providing some useful context information that can # be used for improving the formatter -from __future__ import print_function - import lldb diff --git a/lldb/examples/python/diagnose_unwind.py b/lldb/examples/python/diagnose_unwind.py --- a/lldb/examples/python/diagnose_unwind.py +++ b/lldb/examples/python/diagnose_unwind.py @@ -5,7 +5,6 @@ # information about the stack frames, and tries an alternate unwind # algorithm, that will help to understand why lldb's unwind algorithm # did not succeed. -from __future__ import print_function import optparse import lldb diff --git a/lldb/examples/python/gdbremote.py b/lldb/examples/python/gdbremote.py --- a/lldb/examples/python/gdbremote.py +++ b/lldb/examples/python/gdbremote.py @@ -16,7 +16,6 @@ # available. 
#---------------------------------------------------------------------- -from __future__ import print_function import binascii import subprocess import json diff --git a/lldb/examples/python/globals.py b/lldb/examples/python/globals.py --- a/lldb/examples/python/globals.py +++ b/lldb/examples/python/globals.py @@ -7,7 +7,6 @@ # For the shells sh, bash: # PYTHONPATH=/Applications/Xcode.app/Contents/SharedFrameworks/LLDB.framework/Resources/Python ./globals.py [ ...] #---------------------------------------------------------------------- -from __future__ import print_function import lldb import optparse diff --git a/lldb/examples/python/jump.py b/lldb/examples/python/jump.py --- a/lldb/examples/python/jump.py +++ b/lldb/examples/python/jump.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import lldb import re diff --git a/lldb/examples/python/lldb_module_utils.py b/lldb/examples/python/lldb_module_utils.py --- a/lldb/examples/python/lldb_module_utils.py +++ b/lldb/examples/python/lldb_module_utils.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import lldb import optparse diff --git a/lldb/examples/python/lldbtk.py b/lldb/examples/python/lldbtk.py --- a/lldb/examples/python/lldbtk.py +++ b/lldb/examples/python/lldbtk.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import lldb import shlex diff --git a/lldb/examples/python/mach_o.py b/lldb/examples/python/mach_o.py --- a/lldb/examples/python/mach_o.py +++ b/lldb/examples/python/mach_o.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import cmd import dict_utils diff --git a/lldb/examples/python/memory.py b/lldb/examples/python/memory.py --- a/lldb/examples/python/memory.py +++ b/lldb/examples/python/memory.py @@ -9,8 +9,6 @@ # (lldb) command script import /path/to/cmdtemplate.py #---------------------------------------------------------------------- -from __future__ import print_function - import platform import os import re diff --git a/lldb/examples/python/performance.py b/lldb/examples/python/performance.py --- a/lldb/examples/python/performance.py +++ b/lldb/examples/python/performance.py @@ -8,8 +8,6 @@ # export PYTHONPATH=/Applications/Xcode.app/Contents/SharedFrameworks/LLDB.framework/Resources/Python #---------------------------------------------------------------------- -from __future__ import print_function - import optparse import os import platform diff --git a/lldb/examples/python/process_events.py b/lldb/examples/python/process_events.py --- a/lldb/examples/python/process_events.py +++ b/lldb/examples/python/process_events.py @@ -8,8 +8,6 @@ # export PYTHONPATH=/Applications/Xcode.app/Contents/SharedFrameworks/LLDB.framework/Resources/Python #---------------------------------------------------------------------- -from __future__ import print_function - import optparse import os import platform diff --git a/lldb/examples/python/pytracer.py b/lldb/examples/python/pytracer.py --- a/lldb/examples/python/pytracer.py +++ b/lldb/examples/python/pytracer.py @@ -1,4 +1,3 @@ -from __future__ import print_function import sys import inspect from collections import OrderedDict diff --git a/lldb/examples/python/scripted_step.py b/lldb/examples/python/scripted_step.py --- a/lldb/examples/python/scripted_step.py +++ b/lldb/examples/python/scripted_step.py @@ -93,8 +93,6 @@ # # (lldb) thread step-scripted -C scripted_step.StepWithPlan -from __future__ import print_function - import lldb diff --git a/lldb/examples/python/shadow.py 
b/lldb/examples/python/shadow.py --- a/lldb/examples/python/shadow.py +++ b/lldb/examples/python/shadow.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import lldb import shlex diff --git a/lldb/examples/python/sources.py b/lldb/examples/python/sources.py --- a/lldb/examples/python/sources.py +++ b/lldb/examples/python/sources.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import lldb import shlex diff --git a/lldb/examples/python/stacks.py b/lldb/examples/python/stacks.py --- a/lldb/examples/python/stacks.py +++ b/lldb/examples/python/stacks.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import lldb import optparse import shlex diff --git a/lldb/examples/python/symbolication.py b/lldb/examples/python/symbolication.py --- a/lldb/examples/python/symbolication.py +++ b/lldb/examples/python/symbolication.py @@ -26,7 +26,6 @@ # PYTHONPATH=/path/to/LLDB.framework/Resources/Python ./crashlog.py ~/Library/Logs/DiagnosticReports/a.crash #---------------------------------------------------------------------- -from __future__ import print_function import lldb import optparse import os diff --git a/lldb/examples/python/types.py b/lldb/examples/python/types.py --- a/lldb/examples/python/types.py +++ b/lldb/examples/python/types.py @@ -9,8 +9,6 @@ # (lldb) command script import /path/to/cmdtemplate.py #---------------------------------------------------------------------- -from __future__ import print_function - import platform import os import re diff --git a/lldb/examples/scripting/tree_utils.py b/lldb/examples/scripting/tree_utils.py --- a/lldb/examples/scripting/tree_utils.py +++ b/lldb/examples/scripting/tree_utils.py @@ -18,8 +18,6 @@ http://lldb.llvm.org/scripting.html """ -from __future__ import print_function - def DFS(root, word, cur_path): """ diff --git a/lldb/examples/summaries/cocoa/CFBitVector.py b/lldb/examples/summaries/cocoa/CFBitVector.py --- a/lldb/examples/summaries/cocoa/CFBitVector.py +++ b/lldb/examples/summaries/cocoa/CFBitVector.py @@ -5,7 +5,6 @@ See https://llvm.org/LICENSE.txt for license information. 
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """ -from __future__ import print_function # summary provider for CF(Mutable)BitVector import lldb diff --git a/lldb/examples/summaries/cocoa/Logger.py b/lldb/examples/summaries/cocoa/Logger.py --- a/lldb/examples/summaries/cocoa/Logger.py +++ b/lldb/examples/summaries/cocoa/Logger.py @@ -1,4 +1,3 @@ -from __future__ import print_function import sys import os.path import inspect diff --git a/lldb/examples/summaries/cocoa/NSNumber.py b/lldb/examples/summaries/cocoa/NSNumber.py --- a/lldb/examples/summaries/cocoa/NSNumber.py +++ b/lldb/examples/summaries/cocoa/NSNumber.py @@ -8,8 +8,6 @@ # example summary provider for NSNumber # the real summary is now C++ code built into LLDB -from __future__ import print_function - import lldb import ctypes import lldb.runtime.objc.objc_runtime diff --git a/lldb/examples/synthetic/gnu_libstdcpp.py b/lldb/examples/synthetic/gnu_libstdcpp.py --- a/lldb/examples/synthetic/gnu_libstdcpp.py +++ b/lldb/examples/synthetic/gnu_libstdcpp.py @@ -1,4 +1,3 @@ -from __future__ import division import lldb.formatters.Logger # C++ STL formatters for LLDB diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -82,6 +82,7 @@ eBroadcastBitProgress = (1 << 0), eBroadcastBitWarning = (1 << 1), eBroadcastBitError = (1 << 2), + eBroadcastSymbolChange = (1 << 3), }; static ConstString GetStaticBroadcasterClass(); @@ -430,6 +431,8 @@ llvm::Optional<lldb::user_id_t> debugger_id = llvm::None, std::once_flag *once = nullptr); + static void ReportSymbolChange(const ModuleSpec &module_spec); + protected: friend class CommandInterpreter; friend class REPL; diff --git a/lldb/include/lldb/Core/DebuggerEvents.h b/lldb/include/lldb/Core/DebuggerEvents.h --- a/lldb/include/lldb/Core/DebuggerEvents.h +++ b/lldb/include/lldb/Core/DebuggerEvents.h @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "lldb/Core/ModuleSpec.h" #include "lldb/Utility/ConstString.h" #include "lldb/Utility/Event.h" @@ -82,6 +83,28 @@ const DiagnosticEventData &operator=(const DiagnosticEventData &) = delete; }; +class SymbolChangeEventData : public EventData { +public: + SymbolChangeEventData(lldb::DebuggerWP debugger_wp, ModuleSpec module_spec) + : m_debugger_wp(debugger_wp), m_module_spec(std::move(module_spec)) {} + + static ConstString GetFlavorString(); + ConstString GetFlavor() const override; + + static const SymbolChangeEventData * + GetEventDataFromEvent(const Event *event_ptr); + + void DoOnRemoval(Event *event_ptr) override; + +private: + lldb::DebuggerWP m_debugger_wp; + ModuleSpec m_module_spec; + + SymbolChangeEventData(const SymbolChangeEventData &) = delete; + const SymbolChangeEventData & + operator=(const SymbolChangeEventData &) = delete; +}; + } // namespace lldb_private #endif // LLDB_CORE_DEBUGGER_EVENTS_H diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -60,6 +60,7 @@ bool SetClangModulesCachePath(const FileSpec &path); bool GetEnableExternalLookup() const; bool SetEnableExternalLookup(bool new_value); + bool GetEnableBackgroundLookup() const; bool GetEnableLLDBIndexCache() const; bool SetEnableLLDBIndexCache(bool new_value); uint64_t GetLLDBIndexCacheMaxByteSize(); @@ -457,6 +458,8 @@ static void FindSharedModules(const ModuleSpec &module_spec, ModuleList
&matching_module_list); + static lldb::ModuleSP FindSharedModule(const UUID &uuid); + static size_t RemoveOrphanSharedModules(bool mandatory); static bool RemoveSharedModuleIfOrphaned(const Module *module_ptr); diff --git a/lldb/include/lldb/Symbol/LocateSymbolFile.h b/lldb/include/lldb/Symbol/LocateSymbolFile.h --- a/lldb/include/lldb/Symbol/LocateSymbolFile.h +++ b/lldb/include/lldb/Symbol/LocateSymbolFile.h @@ -14,6 +14,7 @@ #include "lldb/Core/FileSpecList.h" #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/Status.h" +#include "lldb/lldb-forward.h" namespace lldb_private { @@ -52,7 +53,15 @@ // static bool DownloadObjectAndSymbolFile(ModuleSpec &module_spec, Status &error, - bool force_lookup = true); + bool force_lookup = true, + bool copy_executable = true); + + /// Locate the symbol file for the given UUID on a background thread. This + /// function returns immediately. Under the hood it uses the debugger's + /// thread pool to call DownloadObjectAndSymbolFile. If a symbol file is + /// found, this will notify all targets that contain the module with the + /// given UUID. + static void DownloadSymbolFileAsync(const UUID &uuid); }; } // namespace lldb_private diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -162,7 +162,7 @@ bool GetEnableNotifyAboutFixIts() const; FileSpec GetSaveJITObjectsDir() const; - + bool GetEnableSyntheticValue() const; uint32_t GetMaxZeroPaddingInFloatFormat() const; @@ -260,7 +260,7 @@ void DisableASLRValueChangedCallback(); void InheritTCCValueChangedCallback(); void DisableSTDIOValueChangedCallback(); - + // Settings checker for target.jit-save-objects-dir: void CheckJITObjectsDir(); @@ -479,7 +479,8 @@ eBroadcastBitModulesLoaded = (1 << 1), eBroadcastBitModulesUnloaded = (1 << 2), eBroadcastBitWatchpointChanged = (1 << 3), - eBroadcastBitSymbolsLoaded = (1 << 4) + eBroadcastBitSymbolsLoaded = (1 << 4), + eBroadcastBitSymbolsChanged = (1 << 5), }; // These two functions fill out the Broadcaster interface: @@ -981,7 +982,7 @@ ModuleIsExcludedForUnconstrainedSearches(const lldb::ModuleSP &module_sp); const ArchSpec &GetArchitecture() const { return m_arch.GetSpec(); } - + /// Returns the name of the target's ABI plugin. llvm::StringRef GetABIName() const; @@ -1425,30 +1426,30 @@ LazyBool pass = eLazyBoolCalculate; LazyBool notify = eLazyBoolCalculate; LazyBool stop = eLazyBoolCalculate; - DummySignalValues(LazyBool pass, LazyBool notify, LazyBool stop) : - pass(pass), notify(notify), stop(stop) {} + DummySignalValues(LazyBool pass, LazyBool notify, LazyBool stop) + : pass(pass), notify(notify), stop(stop) {} DummySignalValues() = default; }; using DummySignalElement = llvm::StringMapEntry<DummySignalValues>; - static bool UpdateSignalFromDummy(lldb::UnixSignalsSP signals_sp, - const DummySignalElement &element); - static bool ResetSignalFromDummy(lldb::UnixSignalsSP signals_sp, - const DummySignalElement &element); + static bool UpdateSignalFromDummy(lldb::UnixSignalsSP signals_sp, + const DummySignalElement &element); + static bool ResetSignalFromDummy(lldb::UnixSignalsSP signals_sp, + const DummySignalElement &element); public: /// Add a signal to the Target's list of stored signals/actions. These /// values will get copied into any processes launched from /// this target.
- void AddDummySignal(llvm::StringRef name, LazyBool pass, LazyBool print, + void AddDummySignal(llvm::StringRef name, LazyBool pass, LazyBool print, LazyBool stop); /// Updates the signals in signals_sp using the stored dummy signals. /// If warning_stream_sp is not null, if any stored signals are not found in /// the current process, a warning will be emitted here. - void UpdateSignalsFromDummy(lldb::UnixSignalsSP signals_sp, + void UpdateSignalsFromDummy(lldb::UnixSignalsSP signals_sp, lldb::StreamSP warning_stream_sp); /// Clear the dummy signals in signal_names from the target, or all signals /// if signal_names is empty. Also remove the behaviors they set from the - /// process's signals if it exists. + /// process's signals if it exists. void ClearDummySignals(Args &signal_names); /// Print all the signals set in this target. void PrintDummySignals(Stream &strm, Args &signals); @@ -1533,7 +1534,7 @@ lldb::TraceSP m_trace_sp; /// Stores the frame recognizers of this target. lldb::StackFrameRecognizerManagerUP m_frame_recognizer_manager_up; - /// These are used to set the signal state when you don't have a process and + /// These are used to set the signal state when you don't have a process and /// more usefully in the Dummy target where you can't know exactly what /// signals you will have. llvm::StringMap m_dummy_signals; diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -5,6 +5,10 @@ Global, DefaultTrue, Desc<"Control the use of external tools and repositories to locate symbol files. Directories listed in target.debug-file-search-paths and directory of the executable are always checked first for separate debug info files. Then depending on this setting: On macOS, Spotlight would be also used to locate a matching .dSYM bundle based on the UUID of the executable. On NetBSD, directory /usr/libdata/debug would be also searched. 
On platforms other than NetBSD directory /usr/lib/debug would be also searched.">; + def EnableBackgroundLookup: Property<"enable-background-lookup", "Boolean">, + Global, + DefaultFalse, + Desc<"On macOS, enable calling dsymForUUID (or an equivalent script/binary) in the background to locate symbol files that weren't found.">; def ClangModulesCachePath: Property<"clang-modules-cache-path", "FileSpec">, Global, DefaultStringValue<"">, diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -13,6 +13,7 @@ #include "lldb/Core/FormatEntity.h" #include "lldb/Core/Mangled.h" #include "lldb/Core/ModuleList.h" +#include "lldb/Core/ModuleSpec.h" #include "lldb/Core/PluginManager.h" #include "lldb/Core/StreamAsynchronousIO.h" #include "lldb/Core/StreamFile.h" @@ -104,6 +105,7 @@ nullptr; // NOTE: intentional leak to avoid issues with C++ destructor chain static DebuggerList *g_debugger_list_ptr = nullptr; // NOTE: intentional leak to avoid issues with C++ destructor chain +static llvm::ThreadPool *g_thread_pool = nullptr; static constexpr OptionEnumValueElement g_show_disassembly_enum_values[] = { { @@ -538,6 +540,7 @@ "Debugger::Initialize called more than once!"); g_debugger_list_mutex_ptr = new std::recursive_mutex(); g_debugger_list_ptr = new DebuggerList(); + g_thread_pool = new llvm::ThreadPool(llvm::optimal_concurrency()); g_load_plugin_callback = load_plugin_callback; } @@ -545,6 +548,11 @@ assert(g_debugger_list_ptr && "Debugger::Terminate called without a matching Debugger::Initialize!"); + if (g_thread_pool) { + // The destructor will wait for all the threads to complete. + delete g_thread_pool; + } + if (g_debugger_list_ptr && g_debugger_list_mutex_ptr) { // Clear our global list of debugger objects { @@ -1406,6 +1414,18 @@ debugger_id, once); } +void Debugger::ReportSymbolChange(const ModuleSpec &module_spec) { + if (g_debugger_list_ptr && g_debugger_list_mutex_ptr) { + std::lock_guard<std::recursive_mutex> guard(*g_debugger_list_mutex_ptr); + for (DebuggerSP debugger_sp : *g_debugger_list_ptr) { + EventSP event_sp = std::make_shared<Event>( + Debugger::eBroadcastSymbolChange, + new SymbolChangeEventData(debugger_sp, module_spec)); + debugger_sp->GetBroadcaster().BroadcastEvent(event_sp); + } + } +} + static std::shared_ptr<LogHandler> CreateLogHandler(LogHandlerKind log_handler_kind, int fd, bool should_close, size_t buffer_size) { @@ -1702,8 +1722,8 @@ CommandInterpreter::eBroadcastBitAsynchronousErrorData); listener_sp->StartListeningForEvents( - &m_broadcaster, - eBroadcastBitProgress | eBroadcastBitWarning | eBroadcastBitError); + &m_broadcaster, eBroadcastBitProgress | eBroadcastBitWarning | + eBroadcastBitError | eBroadcastSymbolChange); // Let the thread that spawned us know that we have started up and that we // are now listening to all required events so no events get missed @@ -2005,11 +2025,7 @@ } llvm::ThreadPool &Debugger::GetThreadPool() { - // NOTE: intentional leak to avoid issues with C++ destructor chain - static llvm::ThreadPool *g_thread_pool = nullptr; - static llvm::once_flag g_once_flag; - llvm::call_once(g_once_flag, []() { - g_thread_pool = new llvm::ThreadPool(llvm::optimal_concurrency()); - }); + assert(g_thread_pool && + "Debugger::GetThreadPool called before Debugger::Initialize"); return *g_thread_pool; } diff --git a/lldb/source/Core/DebuggerEvents.cpp b/lldb/source/Core/DebuggerEvents.cpp --- a/lldb/source/Core/DebuggerEvents.cpp +++ b/lldb/source/Core/DebuggerEvents.cpp @@ -7,9 +7,12 @@
//===----------------------------------------------------------------------===// #include "lldb/Core/DebuggerEvents.h" +#include "lldb/Core/Debugger.h" +#include "lldb/Core/Module.h" #include "llvm/Support/WithColor.h" using namespace lldb_private; +using namespace lldb; template <typename T> static const T *GetEventDataFromEventImpl(const Event *event_ptr) { @@ -79,3 +82,37 @@ DiagnosticEventData::GetEventDataFromEvent(const Event *event_ptr) { return GetEventDataFromEventImpl<DiagnosticEventData>(event_ptr); } + +ConstString SymbolChangeEventData::GetFlavorString() { + static ConstString g_flavor("SymbolChangeEventData"); + return g_flavor; +} + +ConstString SymbolChangeEventData::GetFlavor() const { + return SymbolChangeEventData::GetFlavorString(); +} + +const SymbolChangeEventData * +SymbolChangeEventData::GetEventDataFromEvent(const Event *event_ptr) { + return GetEventDataFromEventImpl<SymbolChangeEventData>(event_ptr); +} + +void SymbolChangeEventData::DoOnRemoval(Event *event_ptr) { + DebuggerSP debugger_sp(m_debugger_wp.lock()); + if (!debugger_sp) + return; + + for (TargetSP target_sp : debugger_sp->GetTargetList().Targets()) { + if (ModuleSP module_sp = + target_sp->GetImages().FindModule(m_module_spec.GetUUID())) { + { + std::lock_guard<std::recursive_mutex> guard(module_sp->GetMutex()); + if (!module_sp->GetSymbolFileFileSpec()) + module_sp->SetSymbolFileFileSpec(m_module_spec.GetSymbolFileSpec()); + } + ModuleList module_list; + module_list.Append(module_sp); + target_sp->SymbolsDidLoad(module_list); + } + } +} diff --git a/lldb/source/Core/FormatEntity.cpp b/lldb/source/Core/FormatEntity.cpp --- a/lldb/source/Core/FormatEntity.cpp +++ b/lldb/source/Core/FormatEntity.cpp @@ -711,9 +711,6 @@ return false; } - if (valobj == nullptr) - return false; - ValueObject::ExpressionPathAftermath what_next = (do_deref_pointer ?
ValueObject::eExpressionPathAftermathDereference : ValueObject::eExpressionPathAftermathNothing); @@ -1695,7 +1692,7 @@ llvm::StringRef var_representation; const char *var_name = var_value_sp->GetName().GetCString(); if (var_value_sp->GetCompilerType().IsValid()) { - if (var_value_sp && exe_scope->CalculateTarget()) + if (exe_scope && exe_scope->CalculateTarget()) var_value_sp = var_value_sp->GetQualifiedRepresentationIfAvailable( exe_scope->CalculateTarget() diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp --- a/lldb/source/Core/Module.cpp +++ b/lldb/source/Core/Module.cpp @@ -24,6 +24,7 @@ #include "lldb/Interpreter/ScriptInterpreter.h" #include "lldb/Symbol/CompileUnit.h" #include "lldb/Symbol/Function.h" +#include "lldb/Symbol/LocateSymbolFile.h" #include "lldb/Symbol/ObjectFile.h" #include "lldb/Symbol/Symbol.h" #include "lldb/Symbol/SymbolContext.h" @@ -770,7 +771,7 @@ while (i < sc_list.GetSize()) { if (!sc_list.GetContextAtIndex(i, sc)) break; - + bool keep_it = NameMatchesLookupInfo(sc.GetFunctionName(), sc.GetLanguage()); if (keep_it) @@ -1317,8 +1318,11 @@ } UnwindTable &Module::GetUnwindTable() { - if (!m_unwind_table) + if (!m_unwind_table) { m_unwind_table.emplace(*this); + if (!m_symfile_spec) + Symbols::DownloadSymbolFileAsync(GetUUID()); + } return *m_unwind_table; } diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -106,6 +106,12 @@ nullptr, ePropertyEnableExternalLookup, new_value); } +bool ModuleListProperties::GetEnableBackgroundLookup() const { + const uint32_t idx = ePropertyEnableBackgroundLookup; + return m_collection_sp->GetPropertyAtIndexAsBoolean( + nullptr, idx, g_modulelist_properties[idx].default_uint_value != 0); +} + FileSpec ModuleListProperties::GetClangModulesCachePath() const { return m_collection_sp ->GetPropertyAtIndexAsOptionValueFileSpec(nullptr, false, @@ -768,6 +774,10 @@ GetSharedModuleList().FindModules(module_spec, matching_module_list); } +lldb::ModuleSP ModuleList::FindSharedModule(const UUID &uuid) { + return GetSharedModuleList().FindModule(uuid); +} + size_t ModuleList::RemoveOrphanSharedModules(bool mandatory) { return GetSharedModuleList().RemoveOrphans(mandatory); } diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -1609,7 +1609,7 @@ switch (cvt.in(state, input.begin(), input.end(), from_next, &out, &out + 1, to_next)) { case std::codecvt_base::ok: - return out != (int)WEOF; + return out != (EditLineGetCharType)WEOF; case std::codecvt_base::error: case std::codecvt_base::noconv: diff --git a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp --- a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp +++ b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp @@ -295,7 +295,9 @@ break; case ptev_overflow: // The CPU internal buffer had an overflow error and some instructions - // were lost. + // were lost. An OVF packet comes with an FUP packet (hardcoded address) + // according to the documentation, so we'll continue seeing instructions + // after this event.
m_decoded_thread.AppendError(IntelPTError(-pte_overflow)); break; default: diff --git a/lldb/source/Symbol/LocateSymbolFile.cpp b/lldb/source/Symbol/LocateSymbolFile.cpp --- a/lldb/source/Symbol/LocateSymbolFile.cpp +++ b/lldb/source/Symbol/LocateSymbolFile.cpp @@ -8,6 +8,8 @@ #include "lldb/Symbol/LocateSymbolFile.h" +#include "lldb/Core/Debugger.h" +#include "lldb/Core/Module.h" #include "lldb/Core/ModuleList.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Core/Progress.h" @@ -23,7 +25,9 @@ #include "lldb/Utility/Timer.h" #include "lldb/Utility/UUID.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/ThreadPool.h" // From MacOSX system header "mach/machine.h" typedef int cpu_type_t; @@ -397,6 +401,35 @@ return LocateExecutableSymbolFileDsym(module_spec); } +void Symbols::DownloadSymbolFileAsync(const UUID &uuid) { + if (!ModuleList::GetGlobalModuleListProperties().GetEnableBackgroundLookup()) + return; + + static llvm::SmallSet<UUID, 8> g_seen_uuids; + static std::mutex g_mutex; + Debugger::GetThreadPool().async([=]() { + { + std::lock_guard<std::mutex> guard(g_mutex); + if (g_seen_uuids.count(uuid)) + return; + g_seen_uuids.insert(uuid); + } + + Status error; + ModuleSpec module_spec; + module_spec.GetUUID() = uuid; + if (!Symbols::DownloadObjectAndSymbolFile(module_spec, error, + /*force_lookup=*/true, + /*copy_executable=*/false)) + return; + + if (error.Fail()) + return; + + Debugger::ReportSymbolChange(module_spec); + }); +} + #if !defined(__APPLE__) FileSpec Symbols::FindSymbolFileInBundle(const FileSpec &symfile_bundle, @@ -407,7 +440,8 @@ } bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, - Status &error, bool force_lookup) { + Status &error, bool force_lookup, + bool copy_executable) { // Fill in the module_spec.GetFileSpec() for the object file and/or the // module_spec.GetSymbolFileSpec() for the debug symbols file. return false; diff --git a/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp b/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp --- a/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp +++ b/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp @@ -554,7 +554,8 @@ } bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, - Status &error, bool force_lookup) { + Status &error, bool force_lookup, + bool copy_executable) { const UUID *uuid_ptr = module_spec.GetUUIDPtr(); const FileSpec *file_spec_ptr = module_spec.GetFileSpecPtr(); @@ -584,15 +585,18 @@ // Create the dsymForUUID command. StreamString command; + const char *copy_executable_arg = copy_executable ?
"--copyExecutable " : ""; if (!uuid_str.empty()) { - command.Printf("%s --ignoreNegativeCache --copyExecutable %s", - dsymForUUID_exe_path.c_str(), uuid_str.c_str()); + command.Printf("%s --ignoreNegativeCache %s%s", + dsymForUUID_exe_path.c_str(), copy_executable_arg, + uuid_str.c_str()); LLDB_LOGF(log, "Calling %s with UUID %s to find dSYM: %s", dsymForUUID_exe_path.c_str(), uuid_str.c_str(), command.GetString().data()); } else if (!file_path_str.empty()) { - command.Printf("%s --ignoreNegativeCache --copyExecutable %s", - dsymForUUID_exe_path.c_str(), file_path_str.c_str()); + command.Printf("%s --ignoreNegativeCache %s%s", + dsymForUUID_exe_path.c_str(), copy_executable_arg, + file_path_str.c_str()); LLDB_LOGF(log, "Calling %s with file %s to find dSYM: %s", dsymForUUID_exe_path.c_str(), file_path_str.c_str(), command.GetString().data()); diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp --- a/lldb/source/Target/Platform.cpp +++ b/lldb/source/Target/Platform.cpp @@ -2060,10 +2060,9 @@ // the same platform supports all architectures then that's the obvious next // best thing. if (candidates.size() == archs.size()) { - if (std::all_of(candidates.begin(), candidates.end(), - [&](const PlatformSP &p) -> bool { - return p->GetName() == candidates.front()->GetName(); - })) { + if (llvm::all_of(candidates, [&](const PlatformSP &p) -> bool { + return p->GetName() == candidates.front()->GetName(); + })) { return candidates.front(); } } diff --git a/lldb/source/Utility/Event.cpp b/lldb/source/Utility/Event.cpp --- a/lldb/source/Utility/Event.cpp +++ b/lldb/source/Utility/Event.cpp @@ -124,9 +124,7 @@ } void EventDataBytes::Dump(Stream *s) const { - size_t num_printable_chars = - std::count_if(m_bytes.begin(), m_bytes.end(), llvm::isPrint); - if (num_printable_chars == m_bytes.size()) + if (llvm::all_of(m_bytes, llvm::isPrint)) s->Format("\"{0}\"", m_bytes); else s->Format("{0:$[ ]@[x-2]}", llvm::make_range( diff --git a/lldb/test/API/api/check_public_api_headers/TestPublicAPIHeaders.py b/lldb/test/API/api/check_public_api_headers/TestPublicAPIHeaders.py --- a/lldb/test/API/api/check_public_api_headers/TestPublicAPIHeaders.py +++ b/lldb/test/API/api/check_public_api_headers/TestPublicAPIHeaders.py @@ -3,9 +3,6 @@ There should be nothing unwanted there and a simpe main.cpp which includes SB*.h should compile and link with the LLDB framework.""" -from __future__ import print_function - - from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil diff --git a/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py b/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py --- a/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py +++ b/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py @@ -1,8 +1,5 @@ """Test the lldb public C++ api for returning SBCommandReturnObject.""" -from __future__ import print_function - - from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil diff --git a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py --- a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py +++ b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py @@ -1,8 +1,5 @@ """Test the lldb public C++ api when doing multiple debug sessions simultaneously.""" -from __future__ import print_function - - import os import lldb 
diff --git a/lldb/test/API/api/multiple-targets/TestMultipleTargets.py b/lldb/test/API/api/multiple-targets/TestMultipleTargets.py --- a/lldb/test/API/api/multiple-targets/TestMultipleTargets.py +++ b/lldb/test/API/api/multiple-targets/TestMultipleTargets.py @@ -1,8 +1,5 @@ """Test the lldb public C++ api when creating multiple targets simultaneously.""" -from __future__ import print_function - - import os import lldb diff --git a/lldb/test/API/api/multithreaded/TestMultithreaded.py b/lldb/test/API/api/multithreaded/TestMultithreaded.py --- a/lldb/test/API/api/multithreaded/TestMultithreaded.py +++ b/lldb/test/API/api/multithreaded/TestMultithreaded.py @@ -1,7 +1,5 @@ """Test the lldb public C++ api breakpoint callbacks.""" -from __future__ import print_function - # __package__ = "lldbsuite.test" diff --git a/lldb/test/API/arm/emulation/TestEmulations.py b/lldb/test/API/arm/emulation/TestEmulations.py --- a/lldb/test/API/arm/emulation/TestEmulations.py +++ b/lldb/test/API/arm/emulation/TestEmulations.py @@ -2,9 +2,6 @@ Test some ARM instruction emulation. """ -from __future__ import print_function - - import os import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/benchmarks/continue/TestBenchmarkContinue.py b/lldb/test/API/benchmarks/continue/TestBenchmarkContinue.py --- a/lldb/test/API/benchmarks/continue/TestBenchmarkContinue.py +++ b/lldb/test/API/benchmarks/continue/TestBenchmarkContinue.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbbench import * diff --git a/lldb/test/API/benchmarks/expression/TestExpressionCmd.py b/lldb/test/API/benchmarks/expression/TestExpressionCmd.py --- a/lldb/test/API/benchmarks/expression/TestExpressionCmd.py +++ b/lldb/test/API/benchmarks/expression/TestExpressionCmd.py @@ -1,8 +1,5 @@ """Test lldb's expression evaluations and collect statistics.""" -from __future__ import print_function - - import sys import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py b/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py --- a/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py +++ b/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py @@ -1,8 +1,5 @@ """Test evaluating expressions repeatedly comparing lldb against gdb.""" -from __future__ import print_function - - import sys import lldb from lldbsuite.test.lldbbench import BenchBase diff --git a/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py b/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py --- a/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py +++ b/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py @@ -1,8 +1,5 @@ """Test lldb's response time for 'frame variable' command.""" -from __future__ import print_function - - import sys import lldb from lldbsuite.test import configuration diff --git a/lldb/test/API/benchmarks/libcxxlist/TestBenchmarkLibcxxList.py b/lldb/test/API/benchmarks/libcxxlist/TestBenchmarkLibcxxList.py --- a/lldb/test/API/benchmarks/libcxxlist/TestBenchmarkLibcxxList.py +++ b/lldb/test/API/benchmarks/libcxxlist/TestBenchmarkLibcxxList.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbbench import * diff --git a/lldb/test/API/benchmarks/libcxxmap/TestBenchmarkLibcxxMap.py b/lldb/test/API/benchmarks/libcxxmap/TestBenchmarkLibcxxMap.py --- a/lldb/test/API/benchmarks/libcxxmap/TestBenchmarkLibcxxMap.py +++ b/lldb/test/API/benchmarks/libcxxmap/TestBenchmarkLibcxxMap.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - - import lldb from lldbsuite.test.lldbbench import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/benchmarks/startup/TestStartupDelays.py b/lldb/test/API/benchmarks/startup/TestStartupDelays.py --- a/lldb/test/API/benchmarks/startup/TestStartupDelays.py +++ b/lldb/test/API/benchmarks/startup/TestStartupDelays.py @@ -1,8 +1,5 @@ """Test lldb's startup delays creating a target, setting a breakpoint, and run to breakpoint stop.""" -from __future__ import print_function - - import sys import lldb from lldbsuite.test import configuration diff --git a/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py b/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py --- a/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py +++ b/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py @@ -1,7 +1,5 @@ """Test lldb's stepping speed.""" -from __future__ import print_function - import sys import lldb from lldbsuite.test import configuration diff --git a/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py b/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py --- a/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py +++ b/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py @@ -1,8 +1,5 @@ """Benchmark the turnaround time starting a debugger and run to the breakpoint with lldb vs. 
gdb.""" -from __future__ import print_function - - import sys import lldb from lldbsuite.test.lldbbench import * diff --git a/lldb/test/API/commands/command/container/welcome.py b/lldb/test/API/commands/command/container/welcome.py --- a/lldb/test/API/commands/command/container/welcome.py +++ b/lldb/test/API/commands/command/container/welcome.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import sys diff --git a/lldb/test/API/commands/command/script/decorated.py b/lldb/test/API/commands/command/script/decorated.py --- a/lldb/test/API/commands/command/script/decorated.py +++ b/lldb/test/API/commands/command/script/decorated.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import lldb diff --git a/lldb/test/API/commands/command/script/import/bar/bar.py b/lldb/test/API/commands/command/script/import/bar/bar.py --- a/lldb/test/API/commands/command/script/import/bar/bar.py +++ b/lldb/test/API/commands/command/script/import/bar/bar.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - def bar_function(debugger, args, result, dict): global UtilityModule print(UtilityModule.barutil_function("bar told me " + args), file=result) diff --git a/lldb/test/API/commands/command/script/import/foo/bar/foobar.py b/lldb/test/API/commands/command/script/import/foo/bar/foobar.py --- a/lldb/test/API/commands/command/script/import/foo/bar/foobar.py +++ b/lldb/test/API/commands/command/script/import/foo/bar/foobar.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - def foo_function(debugger, args, result, dict): print("foobar says " + args, file=result) return None diff --git a/lldb/test/API/commands/command/script/import/foo/foo.py b/lldb/test/API/commands/command/script/import/foo/foo.py --- a/lldb/test/API/commands/command/script/import/foo/foo.py +++ b/lldb/test/API/commands/command/script/import/foo/foo.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - def foo_function(debugger, args, result, dict): print("foo says " + args, file=result) return None diff --git a/lldb/test/API/commands/command/script/import/foo/foo2.py b/lldb/test/API/commands/command/script/import/foo/foo2.py --- a/lldb/test/API/commands/command/script/import/foo/foo2.py +++ b/lldb/test/API/commands/command/script/import/foo/foo2.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - def foo2_function(debugger, args, result, dict): print("foo2 says " + args, file=result) return None diff --git a/lldb/test/API/commands/command/script/import/thepackage/__init__.py b/lldb/test/API/commands/command/script/import/thepackage/__init__.py --- a/lldb/test/API/commands/command/script/import/thepackage/__init__.py +++ b/lldb/test/API/commands/command/script/import/thepackage/__init__.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from . import TPunitA from . 
import TPunitB diff --git a/lldb/test/API/commands/command/script/mysto.py b/lldb/test/API/commands/command/script/mysto.py --- a/lldb/test/API/commands/command/script/mysto.py +++ b/lldb/test/API/commands/command/script/mysto.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import lldb diff --git a/lldb/test/API/commands/command/script/welcome.py b/lldb/test/API/commands/command/script/welcome.py --- a/lldb/test/API/commands/command/script/welcome.py +++ b/lldb/test/API/commands/command/script/welcome.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import sys diff --git a/lldb/test/API/commands/command/script_alias/tcsacmd.py b/lldb/test/API/commands/command/script_alias/tcsacmd.py --- a/lldb/test/API/commands/command/script_alias/tcsacmd.py +++ b/lldb/test/API/commands/command/script_alias/tcsacmd.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb diff --git a/lldb/test/API/commands/command/source/my.py b/lldb/test/API/commands/command/source/my.py --- a/lldb/test/API/commands/command/source/my.py +++ b/lldb/test/API/commands/command/source/my.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - def date(): import datetime today = datetime.date.today() diff --git a/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py b/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py --- a/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py +++ b/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py @@ -2,9 +2,6 @@ Test that expr will time out and allow other threads to run if it blocks. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/commands/process/launch/TestProcessLaunch.py b/lldb/test/API/commands/process/launch/TestProcessLaunch.py --- a/lldb/test/API/commands/process/launch/TestProcessLaunch.py +++ b/lldb/test/API/commands/process/launch/TestProcessLaunch.py @@ -2,8 +2,6 @@ Test lldb process launch flags. """ -from __future__ import print_function - import os import lldb diff --git a/lldb/test/API/commands/register/register/register_command/TestRegisters.py b/lldb/test/API/commands/register/register/register_command/TestRegisters.py --- a/lldb/test/API/commands/register/register/register_command/TestRegisters.py +++ b/lldb/test/API/commands/register/register/register_command/TestRegisters.py @@ -2,9 +2,6 @@ Test the 'register' command. """ -from __future__ import print_function - - import os import sys import lldb diff --git a/lldb/test/API/commands/watchpoints/multiple_threads/TestWatchpointMultipleThreads.py b/lldb/test/API/commands/watchpoints/multiple_threads/TestWatchpointMultipleThreads.py --- a/lldb/test/API/commands/watchpoints/multiple_threads/TestWatchpointMultipleThreads.py +++ b/lldb/test/API/commands/watchpoints/multiple_threads/TestWatchpointMultipleThreads.py @@ -2,9 +2,6 @@ Test that lldb watchpoint works for multiple threads. 
""" -from __future__ import print_function - - import re import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py b/lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py --- a/lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py +++ b/lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py @@ -1,8 +1,5 @@ """Test that adding, deleting and modifying watchpoints sends the appropriate events.""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py --- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py @@ -2,9 +2,6 @@ Test that you can set breakpoint commands successfully with the Python API's: """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/bktptcmd.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/bktptcmd.py --- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/bktptcmd.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/bktptcmd.py @@ -1,4 +1,3 @@ -from __future__ import print_function import side_effect def useless_function(first, second): diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_in_delayslot/TestAvoidBreakpointInDelaySlot.py b/lldb/test/API/functionalities/breakpoint/breakpoint_in_delayslot/TestAvoidBreakpointInDelaySlot.py --- a/lldb/test/API/functionalities/breakpoint/breakpoint_in_delayslot/TestAvoidBreakpointInDelaySlot.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_in_delayslot/TestAvoidBreakpointInDelaySlot.py @@ -2,8 +2,6 @@ Test specific to MIPS """ -from __future__ import print_function - import re import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/breakpoint/cpp/TestCPPBreakpointLocations.py b/lldb/test/API/functionalities/breakpoint/cpp/TestCPPBreakpointLocations.py --- a/lldb/test/API/functionalities/breakpoint/cpp/TestCPPBreakpointLocations.py +++ b/lldb/test/API/functionalities/breakpoint/cpp/TestCPPBreakpointLocations.py @@ -2,9 +2,6 @@ Test lldb breakpoint ids. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/breakpoint/move_nearest/TestMoveNearest.py b/lldb/test/API/functionalities/breakpoint/move_nearest/TestMoveNearest.py --- a/lldb/test/API/functionalities/breakpoint/move_nearest/TestMoveNearest.py +++ b/lldb/test/API/functionalities/breakpoint/move_nearest/TestMoveNearest.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - import lldb from lldbsuite.test.lldbtest import * import lldbsuite.test.lldbutil as lldbutil diff --git a/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py b/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py --- a/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py +++ b/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py @@ -2,9 +2,6 @@ Test conditionally break on a function and inspect its variables. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSNumber.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSNumber.py --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSNumber.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSNumber.py @@ -3,8 +3,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py b/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py --- a/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-synthval/TestDataFormatterSynthVal.py b/lldb/test/API/functionalities/data-formatter/data-formatter-synthval/TestDataFormatterSynthVal.py --- a/lldb/test/API/functionalities/data-formatter/data-formatter-synthval/TestDataFormatterSynthVal.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-synthval/TestDataFormatterSynthVal.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/data-formatter/dump_dynamic/TestDumpDynamic.py b/lldb/test/API/functionalities/data-formatter/dump_dynamic/TestDumpDynamic.py --- a/lldb/test/API/functionalities/data-formatter/dump_dynamic/TestDumpDynamic.py +++ b/lldb/test/API/functionalities/data-formatter/dump_dynamic/TestDumpDynamic.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from lldbsuite.test import lldbinline lldbinline.MakeInlineTest( diff --git a/lldb/test/API/functionalities/data-formatter/vector-types/TestVectorTypesFormatting.py b/lldb/test/API/functionalities/data-formatter/vector-types/TestVectorTypesFormatting.py --- a/lldb/test/API/functionalities/data-formatter/vector-types/TestVectorTypesFormatting.py +++ b/lldb/test/API/functionalities/data-formatter/vector-types/TestVectorTypesFormatting.py @@ -2,9 +2,6 @@ Check that vector types format properly """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/exec/TestExec.py b/lldb/test/API/functionalities/exec/TestExec.py --- a/lldb/test/API/functionalities/exec/TestExec.py +++ b/lldb/test/API/functionalities/exec/TestExec.py @@ -1,8 +1,6 @@ """ Test some lldb command abbreviations. """ -from __future__ import print_function - import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegOffsets.py b/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegOffsets.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegOffsets.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegOffsets.py @@ -1,4 +1,3 @@ -from __future__ import print_function from textwrap import dedent import lldb from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestArmRegisterDefinition.py b/lldb/test/API/functionalities/gdb_remote_client/TestArmRegisterDefinition.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestArmRegisterDefinition.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestArmRegisterDefinition.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestFork.py b/lldb/test/API/functionalities/gdb_remote_client/TestFork.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestFork.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestFork.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import unittest from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerNoTargetXML.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerNoTargetXML.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerNoTargetXML.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerNoTargetXML.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerTargetXML.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerTargetXML.py --- 
a/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerTargetXML.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerTargetXML.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py b/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestMultiprocess.py b/lldb/test/API/functionalities/gdb_remote_client/TestMultiprocess.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestMultiprocess.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestMultiprocess.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import unittest from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestNestedRegDefinitions.py b/lldb/test/API/functionalities/gdb_remote_client/TestNestedRegDefinitions.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestNestedRegDefinitions.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestNestedRegDefinitions.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestNoGPacketSupported.py b/lldb/test/API/functionalities/gdb_remote_client/TestNoGPacketSupported.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestNoGPacketSupported.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestNoGPacketSupported.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestNoWatchpointSupportInfo.py b/lldb/test/API/functionalities/gdb_remote_client/TestNoWatchpointSupportInfo.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestNoWatchpointSupportInfo.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestNoWatchpointSupportInfo.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestPartialGPacket.py b/lldb/test/API/functionalities/gdb_remote_client/TestPartialGPacket.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestPartialGPacket.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestPartialGPacket.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestRegDefinitionInParts.py b/lldb/test/API/functionalities/gdb_remote_client/TestRegDefinitionInParts.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestRegDefinitionInParts.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestRegDefinitionInParts.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import time from 
lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestRemoteRegNums.py b/lldb/test/API/functionalities/gdb_remote_client/TestRemoteRegNums.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestRemoteRegNums.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestRemoteRegNums.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py b/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestTargetXMLArch.py b/lldb/test/API/functionalities/gdb_remote_client/TestTargetXMLArch.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestTargetXMLArch.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestTargetXMLArch.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/inferior-assert/TestInferiorAssert.py b/lldb/test/API/functionalities/inferior-assert/TestInferiorAssert.py --- a/lldb/test/API/functionalities/inferior-assert/TestInferiorAssert.py +++ b/lldb/test/API/functionalities/inferior-assert/TestInferiorAssert.py @@ -1,8 +1,5 @@ """Test that lldb functions correctly after the inferior has asserted.""" -from __future__ import print_function - - import lldb from lldbsuite.test import lldbutil from lldbsuite.test import lldbplatformutil diff --git a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py --- a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py +++ b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py @@ -2,9 +2,6 @@ Test that breakpoint by symbol name works correctly with dynamic libs. """ -from __future__ import print_function - - import os import re import lldb diff --git a/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py b/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py --- a/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py +++ b/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py @@ -2,9 +2,6 @@ Test that commands do not try and hold on to stale CommandInterpreters in a multiple debuggers scenario """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/plugins/command_plugin/TestPluginCommands.py b/lldb/test/API/functionalities/plugins/command_plugin/TestPluginCommands.py --- a/lldb/test/API/functionalities/plugins/command_plugin/TestPluginCommands.py +++ b/lldb/test/API/functionalities/plugins/command_plugin/TestPluginCommands.py @@ -2,9 +2,6 @@ Test that plugins that load commands work correctly. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py b/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py --- a/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py +++ b/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py @@ -3,9 +3,6 @@ all threads at every stop. """ -from __future__ import print_function - - import os import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py --- a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py +++ b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py @@ -2,8 +2,6 @@ Test basics of linux core file debugging. """ -from __future__ import division, print_function - import shutil import struct import os diff --git a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py --- a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py +++ b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py @@ -2,8 +2,6 @@ Test NetBSD core file debugging. """ -from __future__ import division, print_function - import signal import os diff --git a/lldb/test/API/functionalities/recursion/TestValueObjectRecursion.py b/lldb/test/API/functionalities/recursion/TestValueObjectRecursion.py --- a/lldb/test/API/functionalities/recursion/TestValueObjectRecursion.py +++ b/lldb/test/API/functionalities/recursion/TestValueObjectRecursion.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py b/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py --- a/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py +++ b/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py @@ -3,9 +3,6 @@ they should be delivered in batches instead of one-by-one. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/tty/TestTerminal.py b/lldb/test/API/functionalities/tty/TestTerminal.py --- a/lldb/test/API/functionalities/tty/TestTerminal.py +++ b/lldb/test/API/functionalities/tty/TestTerminal.py @@ -2,9 +2,6 @@ Test lldb command aliases. 
""" -from __future__ import print_function - - import unittest2 import os import lldb diff --git a/lldb/test/API/functionalities/unwind/noreturn/TestNoreturnUnwind.py b/lldb/test/API/functionalities/unwind/noreturn/TestNoreturnUnwind.py --- a/lldb/test/API/functionalities/unwind/noreturn/TestNoreturnUnwind.py +++ b/lldb/test/API/functionalities/unwind/noreturn/TestNoreturnUnwind.py @@ -2,9 +2,6 @@ Test that we can backtrace correctly with 'noreturn' functions on the stack """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py b/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py --- a/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py +++ b/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py @@ -2,9 +2,6 @@ Test that we can backtrace correctly with 'sigtramp' functions on the stack """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py b/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py --- a/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py +++ b/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py @@ -21,9 +21,6 @@ when using API directly, for example in LLDB-MI. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/c/step_over_no_deadlock/TestStepOverDoesntBlock.py b/lldb/test/API/lang/c/step_over_no_deadlock/TestStepOverDoesntBlock.py --- a/lldb/test/API/lang/c/step_over_no_deadlock/TestStepOverDoesntBlock.py +++ b/lldb/test/API/lang/c/step_over_no_deadlock/TestStepOverDoesntBlock.py @@ -2,9 +2,6 @@ Test that step over will let other threads run when necessary """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py b/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py --- a/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py +++ b/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py @@ -2,9 +2,6 @@ Test the lldb disassemble command on each call frame when stopped on C's ctor. """ -from __future__ import print_function - - import os import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/lang/cpp/dynamic-value/TestCppValueCast.py b/lldb/test/API/lang/cpp/dynamic-value/TestCppValueCast.py --- a/lldb/test/API/lang/cpp/dynamic-value/TestCppValueCast.py +++ b/lldb/test/API/lang/cpp/dynamic-value/TestCppValueCast.py @@ -2,9 +2,6 @@ Test lldb Python API SBValue::Cast(SBType) for C++ types. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py b/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py --- a/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py +++ b/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py @@ -2,9 +2,6 @@ Test the lldb disassemble command on lib stdc++. 
""" -from __future__ import print_function - - import os import lldb from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/objc/direct-dispatch-step/TestObjCDirectDispatchStepping.py b/lldb/test/API/lang/objc/direct-dispatch-step/TestObjCDirectDispatchStepping.py --- a/lldb/test/API/lang/objc/direct-dispatch-step/TestObjCDirectDispatchStepping.py +++ b/lldb/test/API/lang/objc/direct-dispatch-step/TestObjCDirectDispatchStepping.py @@ -1,8 +1,5 @@ """Test stepping through ObjC method dispatch in various forms.""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/objc/foundation/TestObjCMethods.py b/lldb/test/API/lang/objc/foundation/TestObjCMethods.py --- a/lldb/test/API/lang/objc/foundation/TestObjCMethods.py +++ b/lldb/test/API/lang/objc/foundation/TestObjCMethods.py @@ -3,9 +3,6 @@ Also lookup objective-c data types and evaluate expressions. """ -from __future__ import print_function - - import os import os.path import lldb diff --git a/lldb/test/API/lang/objc/foundation/TestObjectDescriptionAPI.py b/lldb/test/API/lang/objc/foundation/TestObjectDescriptionAPI.py --- a/lldb/test/API/lang/objc/foundation/TestObjectDescriptionAPI.py +++ b/lldb/test/API/lang/objc/foundation/TestObjectDescriptionAPI.py @@ -2,9 +2,6 @@ Test SBValue.GetObjectDescription() with the value from SBTarget.FindGlobalVariables(). """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py b/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py --- a/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py +++ b/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py @@ -1,8 +1,5 @@ """Test calling functions in class methods.""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/objc/objc-stepping/TestObjCStepping.py b/lldb/test/API/lang/objc/objc-stepping/TestObjCStepping.py --- a/lldb/test/API/lang/objc/objc-stepping/TestObjCStepping.py +++ b/lldb/test/API/lang/objc/objc-stepping/TestObjCStepping.py @@ -1,8 +1,5 @@ """Test stepping through ObjC method dispatch in various forms.""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/objc/print-obj/TestPrintObj.py b/lldb/test/API/lang/objc/print-obj/TestPrintObj.py --- a/lldb/test/API/lang/objc/print-obj/TestPrintObj.py +++ b/lldb/test/API/lang/objc/print-obj/TestPrintObj.py @@ -2,9 +2,6 @@ Test "print object" where another thread blocks the print object from making progress. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lldbtest.py b/lldb/test/API/lldbtest.py --- a/lldb/test/API/lldbtest.py +++ b/lldb/test/API/lldbtest.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import os import re import operator diff --git a/lldb/test/API/macosx/queues/TestQueues.py b/lldb/test/API/macosx/queues/TestQueues.py --- a/lldb/test/API/macosx/queues/TestQueues.py +++ b/lldb/test/API/macosx/queues/TestQueues.py @@ -1,8 +1,5 @@ """Test queues inspection SB APIs.""" -from __future__ import print_function - - import os import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/python_api/default-constructor/TestDefaultConstructorForAPIObjects.py b/lldb/test/API/python_api/default-constructor/TestDefaultConstructorForAPIObjects.py --- a/lldb/test/API/python_api/default-constructor/TestDefaultConstructorForAPIObjects.py +++ b/lldb/test/API/python_api/default-constructor/TestDefaultConstructorForAPIObjects.py @@ -11,9 +11,6 @@ after default construction. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/disassemble-raw-data/TestDisassembleRawData.py b/lldb/test/API/python_api/disassemble-raw-data/TestDisassembleRawData.py --- a/lldb/test/API/python_api/disassemble-raw-data/TestDisassembleRawData.py +++ b/lldb/test/API/python_api/disassemble-raw-data/TestDisassembleRawData.py @@ -2,9 +2,6 @@ Use lldb Python API to disassemble raw machine code bytes """ -from __future__ import print_function - - import re import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/python_api/disassemble-raw-data/TestDisassemble_VST1_64.py b/lldb/test/API/python_api/disassemble-raw-data/TestDisassemble_VST1_64.py --- a/lldb/test/API/python_api/disassemble-raw-data/TestDisassemble_VST1_64.py +++ b/lldb/test/API/python_api/disassemble-raw-data/TestDisassemble_VST1_64.py @@ -2,8 +2,6 @@ Use lldb Python API to disassemble raw machine code bytes """ -from __future__ import print_function - from io import StringIO import sys diff --git a/lldb/test/API/python_api/event/TestEvents.py b/lldb/test/API/python_api/event/TestEvents.py --- a/lldb/test/API/python_api/event/TestEvents.py +++ b/lldb/test/API/python_api/event/TestEvents.py @@ -2,9 +2,6 @@ Test lldb Python event APIs. """ -from __future__ import print_function - - import re import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py b/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py --- a/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py +++ b/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py @@ -1,8 +1,5 @@ """Test Python APIs for working with formatters""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/frame/TestFrames.py b/lldb/test/API/python_api/frame/TestFrames.py --- a/lldb/test/API/python_api/frame/TestFrames.py +++ b/lldb/test/API/python_api/frame/TestFrames.py @@ -3,8 +3,6 @@ And other SBFrame API tests. 
""" -from __future__ import print_function - import io import lldb diff --git a/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py b/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py --- a/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py +++ b/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py @@ -2,9 +2,6 @@ Test that SBFrame::GetVariables() calls work correctly. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py b/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py --- a/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py +++ b/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py @@ -2,9 +2,6 @@ Testlldb Python SBFrame APIs IsInlined() and GetFunctionName(). """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py b/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py --- a/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py +++ b/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py @@ -2,9 +2,6 @@ Test retrieval of SBAddress from function/symbol, disassembly, and SBAddress APIs. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py b/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py --- a/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py +++ b/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py @@ -2,9 +2,6 @@ Test newly added SBSymbol and SBAddress APIs. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/interpreter/TestCommandInterpreterAPI.py b/lldb/test/API/python_api/interpreter/TestCommandInterpreterAPI.py --- a/lldb/test/API/python_api/interpreter/TestCommandInterpreterAPI.py +++ b/lldb/test/API/python_api/interpreter/TestCommandInterpreterAPI.py @@ -1,8 +1,5 @@ """Test the SBCommandInterpreter APIs.""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/lldbutil/frame/TestFrameUtils.py b/lldb/test/API/python_api/lldbutil/frame/TestFrameUtils.py --- a/lldb/test/API/python_api/lldbutil/frame/TestFrameUtils.py +++ b/lldb/test/API/python_api/lldbutil/frame/TestFrameUtils.py @@ -2,9 +2,6 @@ Test utility functions for the frame object. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/lldbutil/iter/TestLLDBIterator.py b/lldb/test/API/python_api/lldbutil/iter/TestLLDBIterator.py --- a/lldb/test/API/python_api/lldbutil/iter/TestLLDBIterator.py +++ b/lldb/test/API/python_api/lldbutil/iter/TestLLDBIterator.py @@ -2,9 +2,6 @@ Test the iteration protocol for some lldb container objects. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/lldbutil/iter/TestRegistersIterator.py b/lldb/test/API/python_api/lldbutil/iter/TestRegistersIterator.py --- a/lldb/test/API/python_api/lldbutil/iter/TestRegistersIterator.py +++ b/lldb/test/API/python_api/lldbutil/iter/TestRegistersIterator.py @@ -2,9 +2,6 @@ Test the iteration protocol for frame registers. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/module_section/TestModuleAndSection.py b/lldb/test/API/python_api/module_section/TestModuleAndSection.py --- a/lldb/test/API/python_api/module_section/TestModuleAndSection.py +++ b/lldb/test/API/python_api/module_section/TestModuleAndSection.py @@ -2,9 +2,6 @@ Test some SBModule and SBSection APIs. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/process/TestProcessAPI.py b/lldb/test/API/python_api/process/TestProcessAPI.py --- a/lldb/test/API/python_api/process/TestProcessAPI.py +++ b/lldb/test/API/python_api/process/TestProcessAPI.py @@ -2,9 +2,6 @@ Test SBProcess APIs, including ReadMemory(), WriteMemory(), and others. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/process/io/TestProcessIO.py b/lldb/test/API/python_api/process/io/TestProcessIO.py --- a/lldb/test/API/python_api/process/io/TestProcessIO.py +++ b/lldb/test/API/python_api/process/io/TestProcessIO.py @@ -1,8 +1,5 @@ """Test Python APIs for process IO.""" -from __future__ import print_function - - import os import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/python_api/symbol-context/TestSymbolContext.py b/lldb/test/API/python_api/symbol-context/TestSymbolContext.py --- a/lldb/test/API/python_api/symbol-context/TestSymbolContext.py +++ b/lldb/test/API/python_api/symbol-context/TestSymbolContext.py @@ -2,8 +2,6 @@ Test SBSymbolContext APIs. """ -from __future__ import print_function - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/target/TestTargetAPI.py b/lldb/test/API/python_api/target/TestTargetAPI.py --- a/lldb/test/API/python_api/target/TestTargetAPI.py +++ b/lldb/test/API/python_api/target/TestTargetAPI.py @@ -2,9 +2,6 @@ Test SBTarget APIs. """ -from __future__ import print_function - - import unittest2 import os import lldb diff --git a/lldb/test/API/python_api/thread/TestThreadAPI.py b/lldb/test/API/python_api/thread/TestThreadAPI.py --- a/lldb/test/API/python_api/thread/TestThreadAPI.py +++ b/lldb/test/API/python_api/thread/TestThreadAPI.py @@ -2,9 +2,6 @@ Test SBThread APIs. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/type/TestTypeList.py b/lldb/test/API/python_api/type/TestTypeList.py --- a/lldb/test/API/python_api/type/TestTypeList.py +++ b/lldb/test/API/python_api/type/TestTypeList.py @@ -2,10 +2,6 @@ Test SBType and SBTypeList API. 
""" -from __future__ import print_function - - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/value/TestValueAPI.py b/lldb/test/API/python_api/value/TestValueAPI.py --- a/lldb/test/API/python_api/value/TestValueAPI.py +++ b/lldb/test/API/python_api/value/TestValueAPI.py @@ -2,9 +2,6 @@ Test some SBValue APIs. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/value/linked_list/TestValueAPILinkedList.py b/lldb/test/API/python_api/value/linked_list/TestValueAPILinkedList.py --- a/lldb/test/API/python_api/value/linked_list/TestValueAPILinkedList.py +++ b/lldb/test/API/python_api/value/linked_list/TestValueAPILinkedList.py @@ -3,9 +3,6 @@ supports iteration till the end of list is reached. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/TestSetWatchpoint.py b/lldb/test/API/python_api/watchpoint/TestSetWatchpoint.py --- a/lldb/test/API/python_api/watchpoint/TestSetWatchpoint.py +++ b/lldb/test/API/python_api/watchpoint/TestSetWatchpoint.py @@ -2,9 +2,6 @@ Use lldb Python SBValue API to create a watchpoint for read_write of 'globl' var. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/TestWatchpointIgnoreCount.py b/lldb/test/API/python_api/watchpoint/TestWatchpointIgnoreCount.py --- a/lldb/test/API/python_api/watchpoint/TestWatchpointIgnoreCount.py +++ b/lldb/test/API/python_api/watchpoint/TestWatchpointIgnoreCount.py @@ -2,9 +2,6 @@ Use lldb Python SBWatchpoint API to set the ignore count. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py b/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py --- a/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py +++ b/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py @@ -2,10 +2,6 @@ Use lldb Python SBTarget API to iterate on the watchpoint(s) for the target. """ -from __future__ import print_function - - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/condition/TestWatchpointConditionAPI.py b/lldb/test/API/python_api/watchpoint/condition/TestWatchpointConditionAPI.py --- a/lldb/test/API/python_api/watchpoint/condition/TestWatchpointConditionAPI.py +++ b/lldb/test/API/python_api/watchpoint/condition/TestWatchpointConditionAPI.py @@ -2,9 +2,6 @@ Test watchpoint condition API. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/watchlocation/TestSetWatchlocation.py b/lldb/test/API/python_api/watchpoint/watchlocation/TestSetWatchlocation.py --- a/lldb/test/API/python_api/watchpoint/watchlocation/TestSetWatchlocation.py +++ b/lldb/test/API/python_api/watchpoint/watchlocation/TestSetWatchlocation.py @@ -2,10 +2,6 @@ Use lldb Python SBValue.WatchPointee() API to create a watchpoint for write of '*g_char_ptr'. 
""" -from __future__ import print_function - - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py b/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py --- a/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py +++ b/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py @@ -2,9 +2,6 @@ Use lldb Python SBtarget.WatchAddress() API to create a watchpoint for write of '*g_char_ptr'. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/qemu/TestQemuAPI.py b/lldb/test/API/qemu/TestQemuAPI.py --- a/lldb/test/API/qemu/TestQemuAPI.py +++ b/lldb/test/API/qemu/TestQemuAPI.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import os from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/qemu/TestQemuLaunch.py b/lldb/test/API/qemu/TestQemuLaunch.py --- a/lldb/test/API/qemu/TestQemuLaunch.py +++ b/lldb/test/API/qemu/TestQemuLaunch.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import unittest import os diff --git a/lldb/test/API/sample_test/TestSampleInlineTest.py b/lldb/test/API/sample_test/TestSampleInlineTest.py --- a/lldb/test/API/sample_test/TestSampleInlineTest.py +++ b/lldb/test/API/sample_test/TestSampleInlineTest.py @@ -2,8 +2,6 @@ Describe the purpose of the test here. """ -from __future__ import absolute_import - from lldbsuite.test import lldbinline lldbinline.MakeInlineTest( diff --git a/lldb/test/API/source-manager/TestSourceManager.py b/lldb/test/API/source-manager/TestSourceManager.py --- a/lldb/test/API/source-manager/TestSourceManager.py +++ b/lldb/test/API/source-manager/TestSourceManager.py @@ -9,8 +9,6 @@ Test the caching mechanism of the source manager. """ -from __future__ import print_function - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py --- a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py +++ b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py @@ -2,9 +2,6 @@ Test that 'stty -a' displays the same output before and after running the lldb command. 
""" -from __future__ import print_function - - import lldb import io import sys diff --git a/lldb/test/API/test_runner/test/inferior.py b/lldb/test/API/test_runner/test/inferior.py --- a/lldb/test/API/test_runner/test/inferior.py +++ b/lldb/test/API/test_runner/test/inferior.py @@ -1,8 +1,6 @@ #!/usr/bin/env python """Inferior program used by process control tests.""" -from __future__ import print_function - import argparse import datetime import signal diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteHostInfo.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteHostInfo.py --- a/lldb/test/API/tools/lldb-server/TestGdbRemoteHostInfo.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteHostInfo.py @@ -1,5 +1,3 @@ -from __future__ import print_function - # lldb test suite imports from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import TestBase diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py b/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py --- a/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py @@ -1,5 +1,3 @@ -from __future__ import print_function - # lldb test suite imports from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import TestBase diff --git a/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py b/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py --- a/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py +++ b/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import gdbremote_testcase import random import select diff --git a/lldb/test/API/tools/lldb-vscode/console/TestVSCode_console.py b/lldb/test/API/tools/lldb-vscode/console/TestVSCode_console.py --- a/lldb/test/API/tools/lldb-vscode/console/TestVSCode_console.py +++ b/lldb/test/API/tools/lldb-vscode/console/TestVSCode_console.py @@ -2,8 +2,6 @@ Test lldb-vscode setBreakpoints request """ -from __future__ import print_function - import vscode from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/tools/lldb-vscode/correct-thread/TestVSCode_correct_thread.py b/lldb/test/API/tools/lldb-vscode/correct-thread/TestVSCode_correct_thread.py --- a/lldb/test/API/tools/lldb-vscode/correct-thread/TestVSCode_correct_thread.py +++ b/lldb/test/API/tools/lldb-vscode/correct-thread/TestVSCode_correct_thread.py @@ -2,8 +2,6 @@ Test lldb-vscode setBreakpoints request """ -from __future__ import print_function - import vscode from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py --- a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py +++ b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py @@ -2,8 +2,6 @@ Test lldb-vscode setBreakpoints request """ -from __future__ import print_function - import vscode from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/tools/lldb-vscode/optimized/TestVSCode_optimized.py b/lldb/test/API/tools/lldb-vscode/optimized/TestVSCode_optimized.py --- a/lldb/test/API/tools/lldb-vscode/optimized/TestVSCode_optimized.py +++ b/lldb/test/API/tools/lldb-vscode/optimized/TestVSCode_optimized.py @@ -2,8 +2,6 @@ Test lldb-vscode variables/stackTrace request for 
optimized code """ -from __future__ import print_function - import vscode from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/tools/lldb-vscode/variables/TestVSCode_variables.py b/lldb/test/API/tools/lldb-vscode/variables/TestVSCode_variables.py --- a/lldb/test/API/tools/lldb-vscode/variables/TestVSCode_variables.py +++ b/lldb/test/API/tools/lldb-vscode/variables/TestVSCode_variables.py @@ -2,8 +2,6 @@ Test lldb-vscode setBreakpoints request """ -from __future__ import print_function - import vscode from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/types/AbstractBase.py b/lldb/test/API/types/AbstractBase.py --- a/lldb/test/API/types/AbstractBase.py +++ b/lldb/test/API/types/AbstractBase.py @@ -2,8 +2,6 @@ Abstract base class of basic types provides a generic type tester method. """ -from __future__ import print_function - import os import re import lldb diff --git a/lldb/test/Shell/Commands/CommandScriptImmediateOutput/Inputs/custom_command.py b/lldb/test/Shell/Commands/CommandScriptImmediateOutput/Inputs/custom_command.py --- a/lldb/test/Shell/Commands/CommandScriptImmediateOutput/Inputs/custom_command.py +++ b/lldb/test/Shell/Commands/CommandScriptImmediateOutput/Inputs/custom_command.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import sys diff --git a/lldb/test/Shell/helper/build.py b/lldb/test/Shell/helper/build.py --- a/lldb/test/Shell/helper/build.py +++ b/lldb/test/Shell/helper/build.py @@ -1,7 +1,5 @@ #!/usr/bin/env python -from __future__ import print_function - import argparse import os import shutil diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -339,6 +339,12 @@ set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name (32/64)" ) +# LLVM_INSTALL_PACKAGE_DIR needs to be declared prior to adding the tools +# subdirectory in order to have the value available for llvm-config. +include(GNUInstallPackageDir) +set(LLVM_INSTALL_PACKAGE_DIR "${CMAKE_INSTALL_PACKAGEDIR}/llvm" CACHE STRING + "Path for CMake subdirectory for LLVM (defaults to '${CMAKE_INSTALL_PACKAGEDIR}/llvm')") + set(LLVM_TOOLS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE STRING "Path for binary subdirectory (defaults to '${CMAKE_INSTALL_BINDIR}')") mark_as_advanced(LLVM_TOOLS_INSTALL_DIR) @@ -1141,6 +1147,9 @@ add_subdirectory(utils/UnicodeData) add_subdirectory(utils/yaml-bench) add_subdirectory(utils/split-file) + if( LLVM_INCLUDE_TESTS ) + add_subdirectory(utils/unittest) + endif() else() if ( LLVM_INCLUDE_TESTS ) message(FATAL_ERROR "Including tests when not building utils will not work. @@ -1185,9 +1194,6 @@ add_subdirectory(utils/lit) add_subdirectory(test) add_subdirectory(unittests) - if( LLVM_INCLUDE_UTILS ) - add_subdirectory(utils/unittest) - endif() if (WIN32) # This utility is used to prevent crashing tests from calling Dr. Watson on diff --git a/llvm/cmake/modules/CMakeLists.txt b/llvm/cmake/modules/CMakeLists.txt --- a/llvm/cmake/modules/CMakeLists.txt +++ b/llvm/cmake/modules/CMakeLists.txt @@ -1,10 +1,7 @@ -include(GNUInstallPackageDir) include(ExtendPath) include(LLVMDistributionSupport) include(FindPrefixFromConfig) -set(LLVM_INSTALL_PACKAGE_DIR "${CMAKE_INSTALL_PACKAGEDIR}/llvm" CACHE STRING - "Path for CMake subdirectory for LLVM (defaults to '${CMAKE_INSTALL_PACKAGEDIR}/llvm')") # CMAKE_INSTALL_PACKAGEDIR might be absolute, so don't reuse below. 
set(llvm_cmake_builddir "${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/llvm") diff --git a/llvm/cmake/modules/FindGRPC.cmake b/llvm/cmake/modules/FindGRPC.cmake --- a/llvm/cmake/modules/FindGRPC.cmake +++ b/llvm/cmake/modules/FindGRPC.cmake @@ -132,7 +132,7 @@ ARGS ${Flags} "${ProtoSourceAbsolutePath}" DEPENDS "${ProtoSourceAbsolutePath}") - add_clang_library(${LibraryName} ${GeneratedProtoSource} + add_llvm_library(${LibraryName} ${GeneratedProtoSource} PARTIAL_SOURCES_INTENDED LINK_LIBS PUBLIC grpc++ protobuf) diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -794,6 +794,9 @@ # Prevent bugs that can happen with llvm's brace style. add_flag_if_supported("-Wmisleading-indentation" MISLEADING_INDENTATION_FLAG) + + # Enable -Wctad-maybe-unsupported to catch unintended use of CTAD. + add_flag_if_supported("-Wctad-maybe-unsupported" CTAD_MAYBE_UNSUPPORTED_FLAG) endif (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)) if (LLVM_COMPILER_IS_GCC_COMPATIBLE AND NOT LLVM_ENABLE_WARNINGS) diff --git a/llvm/docs/CommandGuide/llvm-objdump.rst b/llvm/docs/CommandGuide/llvm-objdump.rst --- a/llvm/docs/CommandGuide/llvm-objdump.rst +++ b/llvm/docs/CommandGuide/llvm-objdump.rst @@ -312,6 +312,10 @@ Disassemble just the specified symbol's instructions. +.. option:: --chained-fixups + + Print chained fixup information. + .. option:: --dyld_info Print bind and rebase information used by dyld to resolve external diff --git a/llvm/docs/CommandGuide/llvm-otool.rst b/llvm/docs/CommandGuide/llvm-otool.rst --- a/llvm/docs/CommandGuide/llvm-otool.rst +++ b/llvm/docs/CommandGuide/llvm-otool.rst @@ -23,6 +23,10 @@ Select slice of universal Mach-O file. +.. option:: -chained_fixups + + Print chained fixup information. + .. option:: -C Print linker optimization hints. diff --git a/llvm/include/llvm/ADT/Optional.h b/llvm/include/llvm/ADT/Optional.h --- a/llvm/include/llvm/ADT/Optional.h +++ b/llvm/include/llvm/ADT/Optional.h @@ -348,6 +348,7 @@ return None; } template <class Function> + LLVM_DEPRECATED("Use transform instead.", "transform") auto map(const Function &F) const & -> Optional<decltype(F(value()))> { if (*this) return F(value()); @@ -378,6 +379,7 @@ return None; } template <class Function> + LLVM_DEPRECATED("Use transform instead.", "transform") auto map(const Function &F) && -> Optional<decltype(F(std::move(*this).value()))> { if (*this) diff --git a/llvm/include/llvm/ADT/SmallSet.h b/llvm/include/llvm/ADT/SmallSet.h --- a/llvm/include/llvm/ADT/SmallSet.h +++ b/llvm/include/llvm/ADT/SmallSet.h @@ -141,6 +141,7 @@ std::set<T, C> Set; using VIterator = typename SmallVector<T, N>::const_iterator; + using SIterator = typename std::set<T, C>::const_iterator; using mutable_iterator = typename SmallVector<T, N>::iterator; // In small mode SmallPtrSet uses linear search for the elements, so it is @@ -171,22 +172,21 @@ } /// insert - Insert an element into the set if it isn't already there. - /// Returns true if the element is inserted (it was not in the set before). - /// The first value of the returned pair is unused and provided for - /// partial compatibility with the standard library self-associative container - /// concept. - // FIXME: Add iterators that abstract over the small and large form, and then - // return those here. - std::pair<NoneType, bool> insert(const T &V) { - if (!isSmall()) - return std::make_pair(None, Set.insert(V).second); + /// Returns a pair. The first value of it is an iterator to the inserted + /// element or the existing element in the set.
The second value is true + /// if the element is inserted (it was not in the set before). + std::pair<const_iterator, bool> insert(const T &V) { + if (!isSmall()) { + auto [I, Inserted] = Set.insert(V); + return std::make_pair(const_iterator(I), Inserted); + } VIterator I = vfind(V); if (I != Vector.end()) // Don't reinsert if it already exists. - return std::make_pair(None, false); + return std::make_pair(const_iterator(I), false); if (Vector.size() < N) { Vector.push_back(V); - return std::make_pair(None, true); + return std::make_pair(const_iterator(std::prev(Vector.end())), true); } // Otherwise, grow from vector to set. @@ -194,8 +194,7 @@ Set.insert(Vector.back()); Vector.pop_back(); } - Set.insert(V); - return std::make_pair(None, true); + return std::make_pair(const_iterator(Set.insert(V).first), true); } template <typename IterT> diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h --- a/llvm/include/llvm/ADT/SmallVector.h +++ b/llvm/include/llvm/ADT/SmallVector.h @@ -92,8 +92,8 @@ template <class T> using SmallVectorSizeType = - typename std::conditional<sizeof(T) >= 8, uint64_t, - uint32_t>::type; + std::conditional_t<sizeof(T) >= 8, uint64_t, + uint32_t>; /// Figure out the offset of the first element. template <class T, typename = void> struct SmallVectorAlignmentAndSize { diff --git a/llvm/include/llvm/Analysis/RegionInfoImpl.h b/llvm/include/llvm/Analysis/RegionInfoImpl.h --- a/llvm/include/llvm/Analysis/RegionInfoImpl.h +++ b/llvm/include/llvm/Analysis/RegionInfoImpl.h @@ -390,10 +390,10 @@ template <class Tr> void RegionBase<Tr>::addSubRegion(RegionT *SubRegion, bool moveChildren) { assert(!SubRegion->parent && "SubRegion already has a parent!"); - assert(llvm::find_if(*this, + assert(llvm::none_of(*this, [&](const std::unique_ptr<RegionT> &R) { return R.get() == SubRegion; - }) == children.end() && + }) && "Subregion already exists!"); SubRegion->parent = static_cast<RegionT *>(this); diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -529,11 +529,9 @@ bool containsAddRecurrence(const SCEV *S); /// Is operation \p BinOp between \p LHS and \p RHS provably does not have - /// a signed/unsigned overflow (\p Signed)? If \p CtxI is specified, the - /// no-overflow fact should be true in the context of this instruction. + /// a signed/unsigned overflow (\p Signed)? bool willNotOverflow(Instruction::BinaryOps BinOp, bool Signed, - const SCEV *LHS, const SCEV *RHS, - const Instruction *CtxI = nullptr); + const SCEV *LHS, const SCEV *RHS); /// Parse NSW/NUW flags from add/sub/mul IR binary operation \p Op into /// SCEV no-wrap flags, and deduce flag[s] that aren't known yet. diff --git a/llvm/include/llvm/BinaryFormat/MachO.h b/llvm/include/llvm/BinaryFormat/MachO.h --- a/llvm/include/llvm/BinaryFormat/MachO.h +++ b/llvm/include/llvm/BinaryFormat/MachO.h @@ -1002,6 +1002,19 @@ uint64_t n_value; }; +// Values for dyld_chained_fixups_header::imports_format. +enum { + DYLD_CHAINED_IMPORT = 1, + DYLD_CHAINED_IMPORT_ADDEND = 2, + DYLD_CHAINED_IMPORT_ADDEND64 = 3, +}; + +// Values for dyld_chained_fixups_header::symbols_format. +enum { + DYLD_CHAINED_SYMBOL_UNCOMPRESSED = 0, + DYLD_CHAINED_SYMBOL_ZLIB = 1, +}; + /// Structs for dyld chained fixups. /// dyld_chained_fixups_header is the data pointed to by LC_DYLD_CHAINED_FIXUPS /// load command.
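
For context on the SmallSet.h hunks above: a minimal usage sketch of the new insert contract, which now returns a std::pair<const_iterator, bool> in the style of std::set::insert. The snippet is illustrative only, assumes the patched llvm/ADT/SmallSet.h, and is not part of the patch itself.

    // Sketch: exercising SmallSet::insert's new return value (assumes the
    // patched llvm/ADT/SmallSet.h; illustrative, not part of this patch).
    #include "llvm/ADT/SmallSet.h"
    #include <cassert>

    int main() {
      llvm::SmallSet<int, 4> S;             // small mode: elements live in a SmallVector
      auto [It, Inserted] = S.insert(42);   // pair<const_iterator, bool>
      assert(Inserted && *It == 42);        // newly inserted element
      auto [It2, Inserted2] = S.insert(42); // duplicate insert
      assert(!Inserted2 && *It2 == 42);     // iterator to the existing element
      return 0;
    }

Under the old contract the first member of the pair was an unused NoneType placeholder; callers can now use the iterator directly, whether the set is in small (vector) or large (std::set) mode.
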
diff --git a/llvm/include/llvm/CodeGen/RDFGraph.h b/llvm/include/llvm/CodeGen/RDFGraph.h --- a/llvm/include/llvm/CodeGen/RDFGraph.h +++ b/llvm/include/llvm/CodeGen/RDFGraph.h @@ -934,6 +934,8 @@ const DataFlowGraph &G; }; + template <typename T> Print(const T &, const DataFlowGraph &) -> Print<T>; + template <typename T> struct PrintNode : Print<NodeAddr<T>> { PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_i386.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_i386.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_i386.h @@ -0,0 +1,39 @@ +//===--- ELF_i386.h - JIT link functions for ELF/i386 --*- C++ -*----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// +// jit-link functions for ELF/i386. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_ELF_I386_H +#define LLVM_EXECUTIONENGINE_JITLINK_ELF_I386_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +namespace llvm { +namespace jitlink { + +/// Create a LinkGraph from an ELF/i386 relocatable object +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected<std::unique_ptr<LinkGraph>> +createLinkGraphFromELFObject_i386(MemoryBufferRef ObjectBuffer); + +/// jit-link the given object buffer, which must be an ELF i386 relocatable +/// object file. +void link_ELF_i386(std::unique_ptr<LinkGraph> G, + std::unique_ptr<JITLinkContext> Ctx); + +} // end namespace jitlink +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_ELF_I386_H diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -1004,10 +1004,10 @@ /// Create a section with the given name, protection flags, and alignment. Section &createSection(StringRef Name, MemProt Prot) { - assert(llvm::find_if(Sections, + assert(llvm::none_of(Sections, [&](std::unique_ptr<Section> &Sec) { return Sec->getName() == Name; - }) == Sections.end() && + }) && "Duplicate section name"); std::unique_ptr<Section> Sec(new Section(Name, Prot, Sections.size())); Sections.push_back(std::move(Sec)); @@ -1349,9 +1349,8 @@ assert(ExternalSymbols.count(&Sym) && "Symbol is not in the externals set"); ExternalSymbols.erase(&Sym); Addressable &Base = *Sym.Base; - assert(llvm::find_if(ExternalSymbols, - [&](Symbol *AS) { return AS->Base == &Base; }) == - ExternalSymbols.end() && + assert(llvm::none_of(ExternalSymbols, + [&](Symbol *AS) { return AS->Base == &Base; }) && "Base addressable still in use"); destroySymbol(Sym); destroyAddressable(Base); @@ -1365,9 +1364,8 @@ "Symbol is not in the absolute symbols set"); AbsoluteSymbols.erase(&Sym); Addressable &Base = *Sym.Base; - assert(llvm::find_if(ExternalSymbols, - [&](Symbol *AS) { return AS->Base == &Base; }) == - ExternalSymbols.end() && + assert(llvm::none_of(ExternalSymbols, + [&](Symbol *AS) { return AS->Base == &Base; }) && "Base addressable still in use"); destroySymbol(Sym); destroyAddressable(Base); diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/i386.h b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h @@ -0,0 +1,38 @@ +//=== i386.h - Generic JITLink i386 edge kinds, utilities -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing i386 objects. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_I386_H +#define LLVM_EXECUTIONENGINE_JITLINK_I386_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +namespace llvm { +namespace jitlink { +namespace i386 { + +/// Represents i386 fixups +enum EdgeKind_i386 : Edge::Kind { + + /// None + None = Edge::FirstRelocation, + +}; + +/// Returns a string name for the given i386 edge. For debugging purposes +/// only. +const char *getEdgeKindName(Edge::Kind K); + +} // namespace i386 +} // namespace jitlink +} // namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_I386_H \ No newline at end of file diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h --- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h @@ -18,6 +18,7 @@ #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/Mangling.h" +#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/ExecutionEngine/Orc/Shared/OrcError.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/Object/Archive.h" @@ -314,6 +315,40 @@ DenseMap<SymbolStringPtr, MemoryBufferRef> ObjectFilesMap; }; +/// A utility class to create COFF dllimport GOT symbols (__imp_*) and PLT +/// stubs. +/// +/// If an instance of this class is attached to a JITDylib as a fallback +/// definition generator, PLT stubs and dllimport __imp_ symbols will be +/// generated for external symbols found outside the given jitdylib. Currently +/// only supports the x86_64 architecture. +class DLLImportDefinitionGenerator : public DefinitionGenerator { +public: + /// Creates a DLLImportDefinitionGenerator instance.
+ static std::unique_ptr<DLLImportDefinitionGenerator> + Create(ExecutionSession &ES, ObjectLinkingLayer &L); + + Error tryToGenerate(LookupState &LS, LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &Symbols) override; + +private: + DLLImportDefinitionGenerator(ExecutionSession &ES, ObjectLinkingLayer &L) + : ES(ES), L(L) {} + + static Expected<unsigned> getTargetPointerSize(const Triple &TT); + static Expected<support::endianness> getTargetEndianness(const Triple &TT); + Expected<std::unique_ptr<jitlink::LinkGraph>> + createStubsGraph(const SymbolMap &Resolved); + + static StringRef getImpPrefix() { return "__imp_"; } + + static StringRef getSectionName() { return "$__DLLIMPORT_STUBS"; } + + ExecutionSession &ES; + ObjectLinkingLayer &L; +}; + } // end namespace orc } // end namespace llvm diff --git a/llvm/include/llvm/IR/CFG.h b/llvm/include/llvm/IR/CFG.h --- a/llvm/include/llvm/IR/CFG.h +++ b/llvm/include/llvm/IR/CFG.h @@ -47,7 +47,7 @@ using pointer = Ptr *; using reference = Ptr *; -private: +protected: using Self = PredIterator<Ptr, USE_iterator>; USE_iterator It; diff --git a/llvm/include/llvm/MC/MCDwarf.h b/llvm/include/llvm/MC/MCDwarf.h --- a/llvm/include/llvm/MC/MCDwarf.h +++ b/llvm/include/llvm/MC/MCDwarf.h @@ -387,6 +387,7 @@ bool hasRootFile() const { return !Header.RootFile.Name.empty(); } + MCDwarfFile &getRootFile() { return Header.RootFile; } const MCDwarfFile &getRootFile() const { return Header.RootFile; } // Report whether MD5 usage has been consistent (all-or-none). diff --git a/llvm/include/llvm/Support/GenericDomTreeConstruction.h b/llvm/include/llvm/Support/GenericDomTreeConstruction.h --- a/llvm/include/llvm/Support/GenericDomTreeConstruction.h +++ b/llvm/include/llvm/Support/GenericDomTreeConstruction.h @@ -712,7 +712,7 @@ assert(IsPostDom && "This function is only for postdominators"); // The tree has only trivial roots -- nothing to update. - if (std::none_of(DT.Roots.begin(), DT.Roots.end(), [BUI](const NodePtr N) { + if (llvm::none_of(DT.Roots, [BUI](const NodePtr N) { return HasForwardSuccessors(N, BUI); })) return; diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h --- a/llvm/include/llvm/Transforms/IPO.h +++ b/llvm/include/llvm/Transforms/IPO.h @@ -240,10 +240,6 @@ /// devirtualization and control-flow integrity. ModulePass *createGlobalSplitPass(); -/// Write ThinLTO-ready bitcode to Str. -ModulePass *createWriteThinLTOBitcodePass(raw_ostream &Str, - raw_ostream *ThinLinkOS = nullptr); - } // End llvm namespace #endif diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h --- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -222,6 +222,8 @@ Value *optimizePuts(CallInst *CI, IRBuilderBase &B); // Helper methods + Value *emitSnPrintfMemCpy(CallInst *CI, Value *StrArg, StringRef Str, + uint64_t N, IRBuilderBase &B); Value *emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilderBase &B); void classifyArgUse(Value *Val, Function *F, bool IsFloat, diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2868,6 +2868,11 @@ if (isImpliedCondition(LHS, RHS, Q.DL).value_or(false)) return getTrue(ITy); break; + case ICmpInst::ICMP_SLE: /// SLE follows the same logic as SGE with the LHS and RHS swapped.
+ if (isImpliedCondition(RHS, LHS, Q.DL).value_or(false)) + return getTrue(ITy); + break; } return nullptr; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -242,11 +242,6 @@ cl::desc("Handle <= and >= in finite loops"), cl::init(true)); -static cl::opt<bool> UseContextForNoWrapFlagInference( - "scalar-evolution-use-context-for-no-wrap-flag-strenghening", cl::Hidden, - cl::desc("Infer nuw/nsw flags using context where suitable"), - cl::init(true)); - //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// @@ -2290,8 +2285,7 @@ } bool ScalarEvolution::willNotOverflow(Instruction::BinaryOps BinOp, bool Signed, - const SCEV *LHS, const SCEV *RHS, - const Instruction *CtxI) { + const SCEV *LHS, const SCEV *RHS) { const SCEV *(ScalarEvolution::*Operation)(const SCEV *, const SCEV *, SCEV::NoWrapFlags, unsigned); switch (BinOp) { @@ -2322,30 +2316,7 @@ const SCEV *LHSB = (this->*Extension)(LHS, WideTy, 0); const SCEV *RHSB = (this->*Extension)(RHS, WideTy, 0); const SCEV *B = (this->*Operation)(LHSB, RHSB, SCEV::FlagAnyWrap, 0); - if (A == B) - return true; - // Can we use context to prove the fact we need? - if (!CtxI) - return false; - // We can prove that add(x, constant) doesn't wrap if isKnownPredicateAt can - // guarantee that x <= max_int - constant at the given context. - // TODO: Support other operations. - if (BinOp != Instruction::Add) - return false; - auto *RHSC = dyn_cast<SCEVConstant>(RHS); - // TODO: Lift this limitation. - if (!RHSC) - return false; - APInt C = RHSC->getAPInt(); - // TODO: Also lift this limitation. - if (Signed && C.isNegative()) - return false; - unsigned NumBits = C.getBitWidth(); - APInt Max = - Signed ? APInt::getSignedMaxValue(NumBits) : APInt::getMaxValue(NumBits); - APInt Limit = Max - C; - ICmpInst::Predicate Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; - return isKnownPredicateAt(Pred, LHS, getConstant(Limit), CtxI); + return A == B; } Optional<SCEV::NoWrapFlags> @@ -2372,18 +2343,16 @@ const SCEV *LHS = getSCEV(OBO->getOperand(0)); const SCEV *RHS = getSCEV(OBO->getOperand(1)); - const Instruction *CtxI = - UseContextForNoWrapFlagInference ? dyn_cast<Instruction>(OBO) : nullptr; if (!OBO->hasNoUnsignedWrap() && willNotOverflow((Instruction::BinaryOps)OBO->getOpcode(), - /* Signed */ false, LHS, RHS, CtxI)) { + /* Signed */ false, LHS, RHS)) { Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); Deduced = true; } if (!OBO->hasNoSignedWrap() && willNotOverflow((Instruction::BinaryOps)OBO->getOpcode(), - /* Signed */ true, LHS, RHS, CtxI)) { + /* Signed */ true, LHS, RHS)) { Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); Deduced = true; } diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp --- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -663,9 +663,10 @@ Asm->OutStreamer->emitLabel(CSRange.ExceptionLabel); // Emit the LSDA header. - // If only one call-site range exists, LPStart is omitted as it is the - // same as the function entry. - if (CallSiteRanges.size() == 1) { + // LPStart is omitted if either we have a single call-site range (in which + // case the function entry is treated as @LPStart) or if this function has + // no landing pads (in which case @LPStart is undefined).
+ if (CallSiteRanges.size() == 1 || LandingPadRange == nullptr) { Asm->emitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart"); } else if (!Asm->isPositionIndependent()) { // For more than one call-site ranges, LPStart must be explicitly diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7754,14 +7754,14 @@ return false; // Check that GEP is used outside the block, meaning it's alive on the // IndirectBr edge(s). - if (find_if(GEPI->users(), [&](User *Usr) { + if (llvm::none_of(GEPI->users(), [&](User *Usr) { if (auto *I = dyn_cast<Instruction>(Usr)) { if (I->getParent() != SrcBlock) { return true; } } return false; - }) == GEPI->users().end()) + })) return false; // The second elements of the GEP chains to be unmerged. std::vector<GetElementPtrInst *> UGEPIs; diff --git a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp --- a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp @@ -193,12 +193,10 @@ "lowest stage of an interval in this LR") \ M(float, progress, {1}, "ratio of current queue size to initial size") -// The model learns to pick one of the mask == 1 interferences. This is the name -// of the output tensor. -// The contract with the model is that the output will be guaranteed to be to a -// mask == 1 position. -// Using a macro here to avoid 'not used' warnings (and keep cond compilation to -// a minimum) +// The model learns to pick one of the mask == 1 interferences. This is the +// name of the output tensor. The contract with the model is that the output +// will be guaranteed to be to a mask == 1 position. Using a macro here to +// avoid 'not used' warnings (and keep cond compilation to a minimum) #define DecisionName "index_to_evict" // Named features index. @@ -211,7 +209,8 @@ // The ML advisor will typically have a sparse input to the evaluator, because // various phys regs won't be available. It's easier (maintenance-wise) to -// bulk-reset the state of the evaluator each time we are about to use it again. +// bulk-reset the state of the evaluator each time we are about to use it +// again. template <typename T> size_t getTotalSize(const std::vector<int64_t> &Shape) { size_t Ret = sizeof(T); for (const auto V : Shape) @@ -227,8 +226,8 @@ #undef _RESET } -// Per-live interval components that get aggregated into the feature values that -// will be passed to the evaluator. +// Per-live interval components that get aggregated into the feature values +// that will be passed to the evaluator. struct LIFeatureComponents { double R = 0; double W = 0; @@ -242,7 +241,8 @@ using CandidateRegList = std::array<std::pair<MCRegister, bool>, NumberOfInterferences>; -using FeaturesListNormalizer = std::array<float, FeatureIDs::FeatureCount>; +using FeaturesListNormalizer = + llvm::SmallVector<float>; /// The ML evictor (commonalities between release and development mode) class MLEvictAdvisor : public RegAllocEvictionAdvisor { @@ -260,10 +260,10 @@ // error, and we shouldn't be asking for it here. const MLModelRunner &getRunner() const { return *Runner; } - /// This just calls Evaluate on the Runner, but in the development mode case, - /// if we're just capturing the log of the default advisor, it needs to call - /// the latter instead, so we need to pass all the necessary parameters for - /// it. In the development case, it will also log.
+ /// This just calls Evaluate on the Runner, but in the development mode + /// case, if we're just capturing the log of the default advisor, it needs + /// to call the latter instead, so we need to pass all the necessary + /// parameters for it. In the development case, it will also log. virtual int64_t tryFindEvictionCandidatePosition(const LiveInterval &VirtReg, const AllocationOrder &Order, @@ -272,11 +272,11 @@ /// Load the features of the given VirtReg (allocated or not) at column Pos, /// but if that can't be evicted, return false instead. - bool - loadInterferenceFeatures(const LiveInterval &VirtReg, MCRegister PhysReg, - bool IsHint, const SmallVirtRegSet &FixedRegisters, - std::array<float, FeatureIDs::FeatureCount> &Largest, - size_t Pos) const; + bool loadInterferenceFeatures(const LiveInterval &VirtReg, MCRegister PhysReg, + bool IsHint, + const SmallVirtRegSet &FixedRegisters, + llvm::SmallVectorImpl<float> &Largest, + size_t Pos) const; private: static float getInitialQueueSize(const MachineFunction &MF); @@ -287,11 +287,12 @@ const SmallVirtRegSet &FixedRegisters) const override; void extractFeatures(const SmallVectorImpl<const LiveInterval *> &Intervals, - std::array<float, FeatureIDs::FeatureCount> &Largest, - size_t Pos, int64_t IsHint, int64_t LocalIntfsCount, + llvm::SmallVectorImpl<float> &Largest, size_t Pos, + int64_t IsHint, int64_t LocalIntfsCount, float NrUrgent) const; - // Point-in-time: we didn't learn this, so we always delegate to the default. + // Point-in-time: we didn't learn this, so we always delegate to the + // default. bool canEvictHintInterference( const LiveInterval &VirtReg, MCRegister PhysReg, const SmallVirtRegSet &FixedRegisters) const override { @@ -303,9 +304,9 @@ getLIFeatureComponents(const LiveInterval &LI) const; // Hold on to a default advisor for: - // 1) the implementation of canEvictHintInterference, because we didn't learn - // that nuance yet; - // 2) for bootstrapping (logging) in the development mode case. + // 1) the implementation of canEvictHintInterference, because we didn't + // learn that nuance yet; 2) for bootstrapping (logging) in the development + // mode case. const DefaultEvictionAdvisor DefaultAdvisor; MLModelRunner *const Runner; const MachineBlockFrequencyInfo &MBFI; @@ -323,10 +324,6 @@ #define _DECL_FEATURES(type, name, shape, _) \ TensorSpec::createSpec<type>(#name, shape), -static const std::vector<TensorSpec> InputFeatures{ - {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}, -}; -#undef _DECL_FEATURES // =================================== // Release (AOT) - specifics // =================================== @@ -334,13 +331,17 @@ : public RegAllocEvictionAdvisorAnalysis { public: ReleaseModeEvictionAdvisorAnalysis() - : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Release) {} + : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Release) { + InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; + } // support for isa<> and dyn_cast. static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { return R->getAdvisorMode() == AdvisorMode::Release; } private: + std::vector<TensorSpec> InputFeatures; + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBlockFrequencyInfo>(); AU.addRequired<MachineLoopInfo>(); @@ -370,19 +371,12 @@ static const TensorSpec Reward = TensorSpec::createSpec<float>("reward", {1}); // Features we bind on the model. The tensor names have a prefix, and we also -// need to include some tensors that are expected to be present by the training -// algo. +// need to include some tensors that are expected to be present by the +// training algo. // TODO: can we just get rid of these?
#define _DECL_TRAIN_FEATURES(type, name, shape, _) \ TensorSpec::createSpec<type>(std::string("action_") + #name, shape), -static const std::vector<TensorSpec> TrainingInputFeatures{ - {RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES) - TensorSpec::createSpec<float>("action_discount", {1}), - TensorSpec::createSpec<int32_t>("action_step_type", {1}), - TensorSpec::createSpec<float>("action_reward", {1})}}; -#undef _DECL_TRAIN_FEATURES - class DevelopmentModeEvictAdvisor : public MLEvictAdvisor { public: DevelopmentModeEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA, @@ -404,7 +398,14 @@ : public RegAllocEvictionAdvisorAnalysis { public: DevelopmentModeEvictionAdvisorAnalysis() - : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Development) {} + : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Development) { + InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; + TrainingInputFeatures = { + RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES) + TensorSpec::createSpec<float>("action_discount", {1}), + TensorSpec::createSpec<int32_t>("action_step_type", {1}), + TensorSpec::createSpec<float>("action_reward", {1})}; + } // support for isa<> and dyn_cast. static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { return R->getAdvisorMode() == AdvisorMode::Development; @@ -420,6 +421,9 @@ } private: + std::vector<TensorSpec> InputFeatures; + std::vector<TensorSpec> TrainingInputFeatures; + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBlockFrequencyInfo>(); AU.addRequired<MachineLoopInfo>(); @@ -486,6 +490,7 @@ std::unique_ptr<MLModelRunner> Runner; StringMap<std::unique_ptr<Logger>> LogMap; }; + #endif //#ifdef LLVM_HAVE_TF_API } // namespace @@ -529,8 +534,8 @@ bool MLEvictAdvisor::loadInterferenceFeatures( const LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, - const SmallVirtRegSet &FixedRegisters, FeaturesListNormalizer &Largest, - size_t Pos) const { + const SmallVirtRegSet &FixedRegisters, + llvm::SmallVectorImpl<float> &Largest, size_t Pos) const { // It is only possible to evict virtual register interference. if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg) { // leave unavailable @@ -547,8 +552,8 @@ SmallVector<const LiveInterval *> InterferingIntervals; for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); - // Different from the default heuristic, we don't make any assumptions about - // what having more than 10 results in the query may mean. + // Different from the default heuristic, we don't make any assumptions + // about what having more than 10 results in the query may mean. const auto &IFIntervals = Q.interferingVRegs(EvictInterferenceCutoff); if (IFIntervals.empty() && InterferingIntervals.empty()) continue; @@ -605,14 +610,14 @@ // max, then any of the costs of the legally-evictable intervals // would be lower. When that happens, one of those will be selected. // Therefore, we allow the candidate be selected, unless the candidate is - // unspillable, in which case it would be incorrect to not find a register for - // it. + // unspillable, in which case it would be incorrect to not find a register + // for it. const bool MustFindEviction = (!VirtReg.isSpillable() && CostPerUseLimit == static_cast<uint8_t>(~0u)); // Number of available candidates - if 0, no need to continue. size_t Available = 0; - // Make sure we don't have leftover partial state from an attempt where we had - // no available candidates and bailed out early. + // Make sure we don't have leftover partial state from an attempt where we + // had no available candidates and bailed out early.
resetInputs(*Runner); // Track the index->register mapping because AllocationOrder doesn't do that @@ -625,15 +630,13 @@ // only normalize (some of) the float features, but it's just simpler to // dimension 'Largest' to all the features, especially since we have the // 'DoNotNormalize' list. - FeaturesListNormalizer Largest; - Largest.fill(0.0); - - // Same overal idea as in the default eviction policy - we visit the values of - // AllocationOrder one at a time. If it's not legally available, we mask off - // the corresponding feature column (==do nothing because we already reset all - // the features to 0) - // Use Pos to capture the column we load features at - in AllocationOrder - // order. + FeaturesListNormalizer Largest(FeatureIDs::FeatureCount, 0.0); + + // Same overall idea as in the default eviction policy - we visit the values + // of AllocationOrder one at a time. If it's not legally available, we mask + // off the corresponding feature column (==do nothing because we already + // reset all the features to 0). Use Pos to capture the column we load + // features at - in AllocationOrder order. size_t Pos = 0; for (auto I = Order.begin(), E = Order.getOrderLimitEnd(OrderLimit); I != E; ++I, ++Pos) { @@ -660,7 +663,8 @@ Regs[CandidateVirtRegPos].second = !MustFindEviction; if (!MustFindEviction) extractFeatures(SmallVector<const LiveInterval *, 1>(1, &VirtReg), Largest, - CandidateVirtRegPos, /*IsHint*/ 0, /*LocalIntfsCount*/ 0, + CandidateVirtRegPos, /*IsHint*/ 0, + /*LocalIntfsCount*/ 0, /*NrUrgent*/ 0.0); assert(InitialQSize > 0.0 && "We couldn't have gotten here if we had " "nothing to allocate initially."); @@ -747,8 +751,8 @@ // of accummulating the various features, we keep them separate. void MLEvictAdvisor::extractFeatures( const SmallVectorImpl<const LiveInterval *> &Intervals, - std::array<float, FeatureIDs::FeatureCount> &Largest, size_t Pos, - int64_t IsHint, int64_t LocalIntfsCount, float NrUrgent) const { + llvm::SmallVectorImpl<float> &Largest, size_t Pos, int64_t IsHint, + int64_t LocalIntfsCount, float NrUrgent) const { int64_t NrDefsAndUses = 0; int64_t NrBrokenHints = 0; double R = 0.0; @@ -854,9 +858,9 @@ } else { MCRegister PhysReg = getDefaultAdvisor().tryFindEvictionCandidate( VirtReg, Order, CostPerUseLimit, FixedRegisters); - // Find the index of the selected PhysReg. We need it for logging, otherwise - // this is wasted cycles (but so would starting development mode without a - // model nor logging) + // Find the index of the selected PhysReg.
We need it for logging, + // otherwise this is wasted cycles (but so would starting development mode + // without a model nor logging) if (!PhysReg) Ret = CandidateVirtRegPos; else diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -281,7 +281,6 @@ Register traceCopies(Register VirtReg) const; Register traceCopyChain(Register Reg) const; - bool shouldAllocateRegister(const Register Reg) const; int getStackSpaceFor(Register VirtReg); void spill(MachineBasicBlock::iterator Before, Register VirtReg, MCPhysReg AssignedReg, bool Kill, bool LiveOut); @@ -301,12 +300,6 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false, false) -bool RegAllocFast::shouldAllocateRegister(const Register Reg) const { - assert(Register::isVirtualRegister(Reg)); - const TargetRegisterClass &RC = *MRI->getRegClass(Reg); - return ShouldAllocateClass(*TRI, RC); -} - void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) { for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) RegUnitStates[*UI] = NewState; @@ -846,8 +839,6 @@ assert(MO.isUndef() && "expected undef use"); Register VirtReg = MO.getReg(); assert(Register::isVirtualRegister(VirtReg) && "Expected virtreg"); - if (!shouldAllocateRegister(VirtReg)) - return; LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg); MCPhysReg PhysReg; @@ -873,8 +864,6 @@ /// (tied or earlyclobber) that may interfere with preassigned uses. void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg) { - if (!shouldAllocateRegister(VirtReg)) - return; LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg); if (LRI != LiveVirtRegs.end()) { MCPhysReg PrevReg = LRI->PhysReg; @@ -908,8 +897,6 @@ void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg, bool LookAtPhysRegUses) { assert(VirtReg.isVirtual() && "Not a virtual register"); - if (!shouldAllocateRegister(VirtReg)) - return; MachineOperand &MO = MI.getOperand(OpNum); LiveRegMap::iterator LRI; bool New; @@ -960,8 +947,6 @@ void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg) { assert(VirtReg.isVirtual() && "Not a virtual register"); - if (!shouldAllocateRegister(VirtReg)) - return; MachineOperand &MO = MI.getOperand(OpNum); LiveRegMap::iterator LRI; bool New; @@ -986,13 +971,8 @@ Register Hint; if (MI.isCopy() && MI.getOperand(1).getSubReg() == 0) { Hint = MI.getOperand(0).getReg(); - if (Hint.isVirtual()) { - assert(!shouldAllocateRegister(Hint)); - Hint = Register(); - } else { - assert(Hint.isPhysical() && - "Copy destination should already be assigned"); - } + assert(Hint.isPhysical() && + "Copy destination should already be assigned"); } allocVirtReg(MI, *LRI, Hint, false); if (LRI->Error) { @@ -1100,8 +1080,6 @@ assert(RegClassDefCounts.size() == TRI->getNumRegClasses()); if (Reg.isVirtual()) { - if (!shouldAllocateRegister(Reg)) - return; const TargetRegisterClass *OpRC = MRI->getRegClass(Reg); for (unsigned RCIdx = 0, RCIdxEnd = TRI->getNumRegClasses(); RCIdx != RCIdxEnd; ++RCIdx) { @@ -1161,8 +1139,6 @@ if (MO.isReg()) { Register Reg = MO.getReg(); if (Reg.isVirtual()) { - if (!shouldAllocateRegister(Reg)) - continue; if (MO.isDef()) { HasDef = true; HasVRegDef = true; @@ -1226,7 +1202,7 @@ } if (MO.isDef()) { - if (Reg.isVirtual() && shouldAllocateRegister(Reg)) + if (Reg.isVirtual()) DefOperandIndexes.push_back(I); addRegClassDefCounts(RegClassDefCounts, Reg); @@ 
-1316,10 +1292,6 @@ Register Reg = MO.getReg(); if (!Reg) continue; - if (Reg.isVirtual()) { - assert(!shouldAllocateRegister(Reg)); - continue; - } assert(Reg.isPhysical()); if (MRI->isReserved(Reg)) continue; @@ -1366,7 +1338,7 @@ if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) + if (!Reg.isVirtual()) continue; if (MO.isUndef()) { @@ -1393,7 +1365,7 @@ if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) + if (!Reg.isVirtual()) continue; assert(MO.isUndef() && "Should only have undef virtreg uses left"); @@ -1416,10 +1388,6 @@ Register Reg = MO.getReg(); if (!Reg) continue; - if (Reg.isVirtual()) { - assert(!shouldAllocateRegister(Reg)); - continue; - } assert(Reg.isPhysical() && "should have register assigned"); // We sometimes get odd situations like: @@ -1449,8 +1417,6 @@ for (Register Reg : MI.getUsedDebugRegs()) { if (!Register::isVirtualRegister(Reg)) continue; - if (!shouldAllocateRegister(Reg)) - continue; // Already spilled to a stackslot? int SS = StackSlotForVirtReg[Reg]; @@ -1491,7 +1457,7 @@ continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) + if (!Reg.isVirtual()) continue; DenseMap::iterator DI; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13213,6 +13213,26 @@ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1); } + // Fold (iM_signext_inreg + // (extract_subvector (zext|anyext|sext iN_v to _) _) + // from iN) + // -> (extract_subvector (signext iN_v to iM)) + if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.hasOneUse() && + ISD::isExtOpcode(N0.getOperand(0).getOpcode())) { + SDValue InnerExt = N0.getOperand(0); + EVT InnerExtVT = InnerExt->getValueType(0); + SDValue Extendee = InnerExt->getOperand(0); + + if (ExtVTBits == Extendee.getValueType().getScalarSizeInBits() && + (!LegalOperations || + TLI.isOperationLegal(ISD::SIGN_EXTEND, InnerExtVT))) { + SDValue SignExtExtendee = + DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), InnerExtVT, Extendee); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, SignExtExtendee, + N0.getOperand(1)); + } + } + return SDValue(); } @@ -22846,25 +22866,31 @@ SDLoc DL(N); EVT IntVT = VT.changeVectorElementTypeToInteger(); EVT IntSVT = VT.getVectorElementType().changeTypeToInteger(); - IntSVT = TLI.getTypeToTransformTo(*DAG.getContext(), IntSVT); - SDValue ZeroElt = DAG.getConstant(0, DL, IntSVT); - SDValue AllOnesElt = DAG.getAllOnesConstant(DL, IntSVT); - SmallVector AndMask(NumElts, DAG.getUNDEF(IntSVT)); - for (int I = 0; I != (int)NumElts; ++I) - if (0 <= Mask[I]) - AndMask[I] = Mask[I] == I ? AllOnesElt : ZeroElt; - - // See if a clear mask is legal instead of going via - // XformToShuffleWithZero which loses UNDEF mask elements. - if (TLI.isVectorClearMaskLegal(ClearMask, IntVT)) - return DAG.getBitcast( - VT, DAG.getVectorShuffle(IntVT, DL, DAG.getBitcast(IntVT, N0), - DAG.getConstant(0, DL, IntVT), ClearMask)); - - if (TLI.isOperationLegalOrCustom(ISD::AND, IntVT)) - return DAG.getBitcast( - VT, DAG.getNode(ISD::AND, DL, IntVT, DAG.getBitcast(IntVT, N0), - DAG.getBuildVector(IntVT, DL, AndMask))); + // Transform the type to a legal type so that the buildvector constant + // elements are not illegal. 
Make sure that the result is larger than the + // original type, incase the value is split into two (eg i64->i32). + if (!TLI.isTypeLegal(IntSVT) && LegalTypes) + IntSVT = TLI.getTypeToTransformTo(*DAG.getContext(), IntSVT); + if (IntSVT.getSizeInBits() >= IntVT.getScalarSizeInBits()) { + SDValue ZeroElt = DAG.getConstant(0, DL, IntSVT); + SDValue AllOnesElt = DAG.getAllOnesConstant(DL, IntSVT); + SmallVector AndMask(NumElts, DAG.getUNDEF(IntSVT)); + for (int I = 0; I != (int)NumElts; ++I) + if (0 <= Mask[I]) + AndMask[I] = Mask[I] == I ? AllOnesElt : ZeroElt; + + // See if a clear mask is legal instead of going via + // XformToShuffleWithZero which loses UNDEF mask elements. + if (TLI.isVectorClearMaskLegal(ClearMask, IntVT)) + return DAG.getBitcast( + VT, DAG.getVectorShuffle(IntVT, DL, DAG.getBitcast(IntVT, N0), + DAG.getConstant(0, DL, IntVT), ClearMask)); + + if (TLI.isOperationLegalOrCustom(ISD::AND, IntVT)) + return DAG.getBitcast( + VT, DAG.getNode(ISD::AND, DL, IntVT, DAG.getBitcast(IntVT, N0), + DAG.getBuildVector(IntVT, DL, AndMask))); + } } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4508,6 +4508,9 @@ return true; switch (Opcode) { + case ISD::VALUETYPE: + return true; + case ISD::UNDEF: return PoisonOnly; @@ -4564,6 +4567,8 @@ unsigned Opcode = Op.getOpcode(); switch (Opcode) { + case ISD::AssertSext: + case ISD::AssertZext: case ISD::FREEZE: case ISD::AND: case ISD::OR: @@ -4575,6 +4580,7 @@ case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::TRUNCATE: + case ISD::SIGN_EXTEND_INREG: case ISD::BITCAST: return false; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4733,7 +4733,8 @@ EVT MemVT = TLI.getMemValueType(DAG.getDataLayout(), I.getValueOperand()->getType()); - if (I.getAlign().value() < MemVT.getSizeInBits() / 8) + if (!TLI.supportsUnalignedAtomics() && + I.getAlign().value() < MemVT.getSizeInBits() / 8) report_fatal_error("Cannot generate unaligned atomic store"); auto Flags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout()); diff --git a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt --- a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt @@ -22,6 +22,7 @@ ELF.cpp ELFLinkGraphBuilder.cpp ELF_aarch64.cpp + ELF_i386.cpp ELF_riscv.cpp ELF_x86_64.cpp @@ -33,6 +34,7 @@ # Architectures: aarch64.cpp + i386.cpp riscv.cpp x86_64.cpp diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp --- a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp @@ -14,6 +14,7 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/ExecutionEngine/JITLink/ELF_aarch64.h" +#include "llvm/ExecutionEngine/JITLink/ELF_i386.h" #include "llvm/ExecutionEngine/JITLink/ELF_riscv.h" #include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h" #include "llvm/Object/ELF.h" @@ -71,6 +72,8 @@ return createLinkGraphFromELFObject_riscv(ObjectBuffer); case ELF::EM_X86_64: return createLinkGraphFromELFObject_x86_64(ObjectBuffer); + case ELF::EM_386: + return createLinkGraphFromELFObject_i386(ObjectBuffer); default: return make_error( "Unsupported target machine 
architecture in ELF object " + @@ -91,6 +94,9 @@ case Triple::x86_64: link_ELF_x86_64(std::move(G), std::move(Ctx)); return; + case Triple::x86: + link_ELF_i386(std::move(G), std::move(Ctx)); + return; default: Ctx->notifyFailed(make_error( "Unsupported target machine architecture in ELF link graph " + diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp @@ -0,0 +1,116 @@ +//===----- ELF_i386.cpp - JIT linker implementation for ELF/i386 ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// ELF/i386 jit-link implementation. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/ELF_i386.h" +#include "ELFLinkGraphBuilder.h" +#include "JITLinkGeneric.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/ExecutionEngine/JITLink/i386.h" +#include "llvm/Object/ELFObjectFile.h" + +#define DEBUG_TYPE "jitlink" + +using namespace llvm; +using namespace llvm::jitlink; + +namespace llvm { +namespace jitlink { + +class ELFJITLinker_i386 : public JITLinker { + friend class JITLinker; + +public: + ELFJITLinker_i386(std::unique_ptr Ctx, + std::unique_ptr G, PassConfiguration PassConfig) + : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) {} + +private: + Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const { + using namespace i386; + using namespace llvm::support; + + switch (E.getKind()) { + case i386::None: { + break; + } + } + return Error::success(); + } +}; + +template +class ELFLinkGraphBuilder_i386 : public ELFLinkGraphBuilder { +private: + static Expected getRelocationKind(const uint32_t Type) { + using namespace i386; + switch (Type) { + case ELF::R_386_NONE: + return EdgeKind_i386::None; + } + + return make_error("Unsupported i386 relocation:" + + formatv("{0:d}", Type)); + } + + Error addRelocations() override { + LLVM_DEBUG(dbgs() << "Adding relocations\n"); + using Base = ELFLinkGraphBuilder; + + return Error::success(); + } + +public: + ELFLinkGraphBuilder_i386(StringRef FileName, const object::ELFFile &Obj, + const Triple T) + : ELFLinkGraphBuilder(Obj, std::move(T), FileName, + i386::getEdgeKindName) {} +}; + +Expected> +createLinkGraphFromELFObject_i386(MemoryBufferRef ObjectBuffer) { + LLVM_DEBUG({ + dbgs() << "Building jitlink graph for new input " + << ObjectBuffer.getBufferIdentifier() << "...\n"; + }); + + auto ELFObj = object::ObjectFile::createELFObjectFile(ObjectBuffer); + if (!ELFObj) + return ELFObj.takeError(); + + assert((*ELFObj)->getArch() == Triple::x86 && + "Only i386 (little endian) is supported for now"); + + auto &ELFObjFile = cast>(**ELFObj); + return ELFLinkGraphBuilder_i386((*ELFObj)->getFileName(), + ELFObjFile.getELFFile(), + (*ELFObj)->makeTriple()) + .buildGraph(); +} + +void link_ELF_i386(std::unique_ptr G, + std::unique_ptr Ctx) { + PassConfiguration Config; + const Triple &TT = G->getTargetTriple(); + if (Ctx->shouldAddDefaultTargetPasses(TT)) { + if (auto MarkLive = Ctx->getMarkLivePass(TT)) + Config.PrePrunePasses.push_back(std::move(MarkLive)); + else + Config.PrePrunePasses.push_back(markAllSymbolsLive); + } + if (auto Err = Ctx->modifyPassConfig(*G, Config)) + return 
Ctx->notifyFailed(std::move(Err)); + + ELFJITLinker_i386::link(std::move(Ctx), std::move(G), std::move(Config)); +} + +} // namespace jitlink +} // namespace llvm \ No newline at end of file diff --git a/llvm/lib/ExecutionEngine/JITLink/i386.cpp b/llvm/lib/ExecutionEngine/JITLink/i386.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/i386.cpp @@ -0,0 +1,30 @@ +//===---- i386.cpp - Generic JITLink i386 edge kinds, utilities -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing i386 objects. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/i386.h" + +#define DEBUG_TYPE "jitlink" + +namespace llvm { +namespace jitlink { +namespace i386 { + +const char *getEdgeKindName(Edge::Kind K) { + switch (K) { + case None: + return "None"; + } + return getGenericEdgeKindName(K); +} +} // namespace i386 +} // namespace jitlink +} // namespace llvm \ No newline at end of file diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/JITLink/x86_64.h" #include "llvm/ExecutionEngine/Orc/Layer.h" #include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" #include "llvm/IR/Constants.h" @@ -350,7 +351,6 @@ Error StaticLibraryDefinitionGenerator::tryToGenerate( LookupState &LS, LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags, const SymbolLookupSet &Symbols) { - // Don't materialize symbols from static archives unless this is a static // lookup. if (K != LookupKind::Static) @@ -430,5 +430,121 @@ Err = buildObjectFilesMap(); } +std::unique_ptr +DLLImportDefinitionGenerator::Create(ExecutionSession &ES, + ObjectLinkingLayer &L) { + return std::unique_ptr( + new DLLImportDefinitionGenerator(ES, L)); +} + +Error DLLImportDefinitionGenerator::tryToGenerate( + LookupState &LS, LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, const SymbolLookupSet &Symbols) { + JITDylibSearchOrder LinkOrder; + JD.withLinkOrderDo([&](const JITDylibSearchOrder &LO) { + LinkOrder.reserve(LO.size()); + for (auto &KV : LO) { + if (KV.first == &JD) + continue; + LinkOrder.push_back(KV); + } + }); + + // FIXME: if regular symbol name start with __imp_ we have to issue lookup of + // both __imp_ and stripped name and use the lookup information to resolve the + // real symbol name. 
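For context on the FIXME above: on Windows, a dllimport reference is compiled as an indirect access through a pointer symbol named __imp_<symbol>. A minimal illustration of the client code this generator serves (hypothetical example, not part of the patch):

  // MSVC-style C++; the call is emitted as an indirect call through the
  // pointer slot __imp_foo rather than a direct call to foo.
  __declspec(dllimport) int foo(int);
  int callsFoo() { return foo(1); }

The code that follows therefore strips the __imp_ prefix before performing the lookup, and then synthesizes both the __imp_ pointer slot and a jump stub under the original name.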
+  SymbolLookupSet LookupSet;
+  DenseMap<StringRef, SymbolLookupFlags> ToLookUpSymbols;
+  for (auto &KV : Symbols) {
+    StringRef Deinterned = *KV.first;
+    if (Deinterned.startswith(getImpPrefix()))
+      Deinterned = Deinterned.drop_front(StringRef(getImpPrefix()).size());
+    // Don't degrade the required state
+    if (ToLookUpSymbols.count(Deinterned) &&
+        ToLookUpSymbols[Deinterned] == SymbolLookupFlags::RequiredSymbol)
+      continue;
+    ToLookUpSymbols[Deinterned] = KV.second;
+  }
+
+  for (auto &KV : ToLookUpSymbols)
+    LookupSet.add(ES.intern(KV.first), KV.second);
+
+  auto Resolved =
+      ES.lookup(LinkOrder, LookupSet, LookupKind::DLSym, SymbolState::Resolved);
+  if (!Resolved)
+    return Resolved.takeError();
+
+  auto G = createStubsGraph(*Resolved);
+  if (!G)
+    return G.takeError();
+  return L.add(JD, std::move(*G));
+}
+
+Expected<unsigned>
+DLLImportDefinitionGenerator::getTargetPointerSize(const Triple &TT) {
+  switch (TT.getArch()) {
+  case Triple::x86_64:
+    return 8;
+  default:
+    return make_error<StringError>(
+        "architecture unsupported by DLLImportDefinitionGenerator",
+        inconvertibleErrorCode());
+  }
+}
+
+Expected<support::endianness>
+DLLImportDefinitionGenerator::getTargetEndianness(const Triple &TT) {
+  switch (TT.getArch()) {
+  case Triple::x86_64:
+    return support::endianness::little;
+  default:
+    return make_error<StringError>(
+        "architecture unsupported by DLLImportDefinitionGenerator",
+        inconvertibleErrorCode());
+  }
+}
+
+Expected<std::unique_ptr<jitlink::LinkGraph>>
+DLLImportDefinitionGenerator::createStubsGraph(const SymbolMap &Resolved) {
+  Triple TT = ES.getExecutorProcessControl().getTargetTriple();
+  auto PointerSize = getTargetPointerSize(TT);
+  if (!PointerSize)
+    return PointerSize.takeError();
+  auto Endianness = getTargetEndianness(TT);
+  if (!Endianness)
+    return Endianness.takeError();
+
+  auto G = std::make_unique<jitlink::LinkGraph>(
+      "<DLLIMPORT_STUBS>", TT, *PointerSize, *Endianness,
+      jitlink::getGenericEdgeKindName);
+  jitlink::Section &Sec = G->createSection(
+      getSectionName(), jitlink::MemProt::Read | jitlink::MemProt::Exec);
+
+  for (auto &KV : Resolved) {
+    jitlink::Symbol &Target = G->addAbsoluteSymbol(
+        *KV.first, ExecutorAddr(KV.second.getAddress()), *PointerSize,
+        jitlink::Linkage::Strong, jitlink::Scope::Local, false);
+
+    // Create __imp_ symbol
+    jitlink::Symbol &Ptr =
+        jitlink::x86_64::createAnonymousPointer(*G, Sec, &Target);
+    auto NameCopy = G->allocateString(Twine(getImpPrefix()) + *KV.first);
+    StringRef NameCopyRef = StringRef(NameCopy.data(), NameCopy.size());
+    Ptr.setName(NameCopyRef);
+    Ptr.setLinkage(jitlink::Linkage::Strong);
+    Ptr.setScope(jitlink::Scope::Default);
+
+    // Create PLT stub
+    // FIXME: check PLT stub of data symbol is not accessed
+    jitlink::Block &StubBlock =
+        jitlink::x86_64::createPointerJumpStubBlock(*G, Sec, Ptr);
+    G->addDefinedSymbol(StubBlock, 0, *KV.first, StubBlock.getSize(),
+                        jitlink::Linkage::Strong, jitlink::Scope::Default, true,
+                        false);
+  }
+
+  return std::move(G);
+}
+
 } // End namespace orc.
 } // End namespace llvm.
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -412,21 +412,20 @@
   // MDNode. This loop also initializes DILocationReachable, later
   // needed by updateLoopMetadataDebugLocationsImpl; the use of
   // count_if avoids an early exit.
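A structural note on the DebugInfo.cpp hunk that follows: loop metadata is self-referential, i.e. operand 0 of the !llvm.loop node is the node itself (for instance !0 = distinct !{!0, !1, !2}, with !1 a DILocation and !2 a property such as !{!"llvm.loop.unroll.disable"}), which is why the scans start at the second operand; llvm::drop_begin states that more directly than op_begin() + 1. One behavioral nuance: std::count_if visits every operand, while llvm::none_of stops at the first operand for which the predicate holds, so the side effect the comment above relies on (fully populating DILocationReachable) no longer extends past an early match.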
- if (!std::count_if(N->op_begin() + 1, N->op_end(), - [&Visited, &DILocationReachable](const MDOperand &Op) { - return isDILocationReachable( - Visited, DILocationReachable, Op.get()); - })) + if (llvm::none_of(llvm::drop_begin(N->operands()), + [&Visited, &DILocationReachable](const MDOperand &Op) { + return isDILocationReachable(Visited, DILocationReachable, + Op.get()); + })) return N; // If there is only the debug location without any actual loop metadata, we // can remove the metadata. - if (std::all_of( - N->op_begin() + 1, N->op_end(), - [&Visited, &DILocationReachable](const MDOperand &Op) { - return isDILocationReachable(Visited, DILocationReachable, - Op.get()); - })) + if (llvm::all_of(llvm::drop_begin(N->operands()), + [&Visited, &DILocationReachable](const MDOperand &Op) { + return isDILocationReachable(Visited, DILocationReachable, + Op.get()); + })) return nullptr; return updateLoopMetadataDebugLocationsImpl( diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -869,7 +869,7 @@ // Remap compilation directory. remapDebugPath(CompilationDir); - // Remap MCDwarfDirs in all compilation units. + // Remap MCDwarfDirs and RootFile.Name in all compilation units. SmallString<256> P; for (auto &CUIDTablePair : MCDwarfLineTablesCUMap) { for (auto &Dir : CUIDTablePair.second.getMCDwarfDirs()) { @@ -877,6 +877,12 @@ remapDebugPath(P); Dir = std::string(P); } + + // Used by DW_TAG_compile_unit's DT_AT_name and DW_TAG_label's + // DW_AT_decl_file for DWARF v5 generated for assembly source. + P = CUIDTablePair.second.getRootFile().Name; + remapDebugPath(P); + CUIDTablePair.second.getRootFile().Name = std::string(P); } } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8235,15 +8235,17 @@ Swap = true; } } - // 64-bit check whether we can use CSINC. To avoid signed integer - // overflow the condition ignores wrap around, which is already - // handled by CSINV above. - } else if (1 == - std::max(TrueVal, FalseVal) - std::min(TrueVal, FalseVal)) { - Opcode = AArch64ISD::CSINC; - - if (TrueVal > FalseVal) { - Swap = true; + } else { + // 64-bit check whether we can use CSINC. 
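The rewrite that follows sidesteps signed overflow: the old form computed std::max(TrueVal, FalseVal) - std::min(TrueVal, FalseVal), which is undefined behavior for INT64_MIN versus 0, whereas unsigned wraparound is well defined. A standalone sketch of the check (illustrative only, mirroring the logic below; the helper name is hypothetical):

  #include <cstdint>
  // True when the two select arms differ by exactly one; computed in
  // uint64_t so INT64_MIN vs. 0 wraps instead of overflowing.
  bool areAdjacent(int64_t TrueVal, int64_t FalseVal) {
    const uint64_t T = TrueVal, F = FalseVal;
    return T == F + 1 || T + 1 == F;
  }

With TrueVal = INT64_MIN and FalseVal = 0 both comparisons are false, so CSINC is correctly rejected; the new foo18_overflow3/foo18_overflow4 tests in arm64-csel.ll later in this patch pin down exactly that case.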
+ const uint64_t TrueVal64 = TrueVal; + const uint64_t FalseVal64 = FalseVal; + + if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) { + Opcode = AArch64ISD::CSINC; + + if (TrueVal > FalseVal) { + Swap = true; + } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -338,9 +338,9 @@ static bool canFitIntoPipeline(SUnit &SU, ScheduleDAGInstrs *DAG, DenseSet &ConflictedInstrs) { - return std::all_of( - ConflictedInstrs.begin(), ConflictedInstrs.end(), - [DAG, &SU](SUnit *SuccSU) { return DAG->canAddEdge(SuccSU, &SU); }); + return llvm::all_of(ConflictedInstrs, [DAG, &SU](SUnit *SuccSU) { + return DAG->canAddEdge(SuccSU, &SU); + }); } void SchedGroup::initSchedGroup(std::vector::reverse_iterator RIter, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -7679,10 +7679,9 @@ // extend that single value SDValue FirstOp = Op.getOperand(0); if (!isa(FirstOp) && - std::all_of(std::next(Op->op_begin()), Op->op_end(), - [&FirstOp](SDUse &U) { - return U.get().isUndef() || U.get() == FirstOp; - })) { + llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) { + return U.get().isUndef() || U.get() == FirstOp; + })) { SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, DAG.getValueType(MVT::i1)); return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp --- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -648,45 +648,28 @@ MachineInstr &MI = *MBBI; Register DstLoReg, DstHiReg; Register DstReg = MI.getOperand(0).getReg(); - Register TmpReg = 0; // 0 for no temporary register Register SrcReg = MI.getOperand(1).getReg(); bool SrcIsKill = MI.getOperand(1).isKill(); unsigned OpLo = AVR::LDRdPtr; unsigned OpHi = AVR::LDDRdPtrQ; TRI->splitReg(DstReg, DstLoReg, DstHiReg); - // Use a temporary register if src and dst registers are the same. - if (DstReg == SrcReg) - TmpReg = scavengeGPR8(MI); - - Register CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg; - Register CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg; + // DstReg has an earlyclobber so the register allocator will allocate them in + // separate registers. + assert(DstReg != SrcReg && "Dst and Src registers are the same!"); // Load low byte. - auto MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(CurDstLoReg, RegState::Define) - .addReg(SrcReg); - - // Push low byte onto stack if necessary. - if (TmpReg) - buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg); + buildMI(MBB, MBBI, OpLo) + .addReg(DstLoReg, RegState::Define) + .addReg(SrcReg) + .setMemRefs(MI.memoperands()); // Load high byte. - auto MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(CurDstHiReg, RegState::Define) - .addReg(SrcReg, getKillRegState(SrcIsKill)) - .addImm(1); - - if (TmpReg) { - // Move the high byte into the final destination. - buildMI(MBB, MBBI, AVR::MOVRdRr, DstHiReg).addReg(TmpReg); - - // Move the low byte from the scratch space into the final destination. 
- buildMI(MBB, MBBI, AVR::POPRd, DstLoReg); - } - - MIBLO.setMemRefs(MI.memoperands()); - MIBHI.setMemRefs(MI.memoperands()); + buildMI(MBB, MBBI, OpHi) + .addReg(DstHiReg, RegState::Define) + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(1) + .setMemRefs(MI.memoperands()); MI.eraseFromParent(); return true; @@ -763,7 +746,6 @@ MachineInstr &MI = *MBBI; Register DstLoReg, DstHiReg; Register DstReg = MI.getOperand(0).getReg(); - Register TmpReg = 0; // 0 for no temporary register Register SrcReg = MI.getOperand(1).getReg(); unsigned Imm = MI.getOperand(2).getImm(); bool SrcIsKill = MI.getOperand(1).isKill(); @@ -775,39 +757,23 @@ // highest Imm value allowed for the instruction, 62 is the limit here. assert(Imm <= 62 && "Offset is out of range"); - // Use a temporary register if src and dst registers are the same. - if (DstReg == SrcReg) - TmpReg = scavengeGPR8(MI); - - Register CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg; - Register CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg; + // DstReg has an earlyclobber so the register allocator will allocate them in + // separate registers. + assert(DstReg != SrcReg && "Dst and Src registers are the same!"); // Load low byte. - auto MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(CurDstLoReg, RegState::Define) - .addReg(SrcReg) - .addImm(Imm); - - // Push low byte onto stack if necessary. - if (TmpReg) - buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg); + buildMI(MBB, MBBI, OpLo) + .addReg(DstLoReg, RegState::Define) + .addReg(SrcReg) + .addImm(Imm) + .setMemRefs(MI.memoperands()); // Load high byte. - auto MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(CurDstHiReg, RegState::Define) - .addReg(SrcReg, getKillRegState(SrcIsKill)) - .addImm(Imm + 1); - - if (TmpReg) { - // Move the high byte into the final destination. - buildMI(MBB, MBBI, AVR::MOVRdRr, DstHiReg).addReg(TmpReg); - - // Move the low byte from the scratch space into the final destination. - buildMI(MBB, MBBI, AVR::POPRd, DstLoReg); - } - - MIBLO.setMemRefs(MI.memoperands()); - MIBHI.setMemRefs(MI.memoperands()); + buildMI(MBB, MBBI, OpHi) + .addReg(DstHiReg, RegState::Define) + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(Imm + 1) + .setMemRefs(MI.memoperands()); MI.eraseFromParent(); return true; @@ -1382,8 +1348,8 @@ .addReg(DstReg, getKillRegState(DstIsKill)) .addReg(ZERO_REGISTER); - // SREG is always implicitly killed - MIB->getOperand(2).setIsKill(); + MIB->getOperand(3).setIsDead(); // SREG is always dead + MIB->getOperand(4).setIsKill(); // SREG is always implicitly killed MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -56,6 +56,7 @@ const AVRSubtarget &STI = MF.getSubtarget(); const AVRInstrInfo &TII = *STI.getInstrInfo(); const AVRMachineFunctionInfo *AFI = MF.getInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); bool HasFP = hasFP(MF); // Interrupt handlers re-enable interrupts in function entry. @@ -68,8 +69,8 @@ // Emit special prologue code to save R1, R0 and SREG in interrupt/signal // handlers before saving any other registers. 
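Concretely, the interrupt/signal-handler prologue built below now amounts to the following (a sketch in AVR assembly, per the BuildMI calls that follow):

  push r0        ; save R0
  in   r0, SREG  ; read the status register through R0
  push r0        ; save SREG
  push r1        ; only if R1 is used anywhere in the function:
  clr  r1        ;   re-establish R1 as the zero register

whereas previously R1:R0 were pushed unconditionally as a pair and R1 was always cleared. Skipping the R1 traffic in handlers that never touch R1 is the point of the new MRI.reg_empty(AVR::R1) checks here and in the matching epilogue.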
if (AFI->isInterruptOrSignalHandler()) { - BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHWRr)) - .addReg(AVR::R1R0, RegState::Kill) + BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHRr)) + .addReg(AVR::R0, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII.get(AVR::INRdA), AVR::R0) @@ -78,11 +79,16 @@ BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHRr)) .addReg(AVR::R0, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); - BuildMI(MBB, MBBI, DL, TII.get(AVR::EORRdRr)) - .addReg(AVR::R1, RegState::Define) - .addReg(AVR::R1, RegState::Kill) - .addReg(AVR::R1, RegState::Kill) - .setMIFlag(MachineInstr::FrameSetup); + if (!MRI.reg_empty(AVR::R1)) { + BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHRr)) + .addReg(AVR::R1, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII.get(AVR::EORRdRr)) + .addReg(AVR::R1, RegState::Define) + .addReg(AVR::R1, RegState::Kill) + .addReg(AVR::R1, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } } // Early exit if the frame pointer is not needed in this function. @@ -132,6 +138,7 @@ static void restoreStatusRegister(MachineFunction &MF, MachineBasicBlock &MBB) { const AVRMachineFunctionInfo *AFI = MF.getInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); @@ -142,11 +149,14 @@ // Emit special epilogue code to restore R1, R0 and SREG in interrupt/signal // handlers at the very end of the function, just before reti. if (AFI->isInterruptOrSignalHandler()) { + if (!MRI.reg_empty(AVR::R1)) { + BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R1); + } BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R0); BuildMI(MBB, MBBI, DL, TII.get(AVR::OUTARr)) .addImm(STI.getIORegSREG()) .addReg(AVR::R0, RegState::Kill); - BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R1R0); + BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R0); } } diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -57,6 +57,8 @@ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand); + setOperationAction(ISD::INLINEASM, MVT::Other, Custom); + for (MVT VT : MVT::integer_valuetypes()) { for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) { setLoadExtAction(N, VT, MVT::i1, Promote); @@ -836,6 +838,52 @@ MachinePointerInfo(SV)); } +// Modify the existing ISD::INLINEASM node to add the implicit register r1. +SDValue AVRTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { + SDValue R1Reg = DAG.getRegister(AVR::R1, MVT::i8); + if (Op.getOperand(Op.getNumOperands() - 1) == R1Reg || + Op.getOperand(Op.getNumOperands() - 2) == R1Reg) { + // R1 has already been added. Don't add it again. + // If this isn't handled, we get called over and over again. + return Op; + } + + // Get a list of operands to the new INLINEASM node. This is mostly a copy, + // with some edits. + // Add the following operands at the end (but before the glue node, if it's + // there): + // - The flags of the implicit R1 register operand. + // - The implicit R1 register operand itself. + SDLoc dl(Op); + SmallVector Ops; + SDNode *N = Op.getNode(); + SDValue Glue; + for (unsigned I = 0; I < N->getNumOperands(); I++) { + SDValue Operand = N->getOperand(I); + if (Operand.getValueType() == MVT::Glue) { + // The glue operand always needs to be at the end, so we need to treat it + // specially. 
+ Glue = Operand; + } else { + Ops.push_back(Operand); + } + } + unsigned Flags = InlineAsm::getFlagWord(InlineAsm::Kind_RegUse, 1); + Ops.push_back(DAG.getTargetConstant(Flags, dl, MVT::i32)); + Ops.push_back(R1Reg); + if (Glue) { + Ops.push_back(Glue); + } + + // Replace the current INLINEASM node with a new one that has R1 as implicit + // parameter. + SDValue New = DAG.getNode(N->getOpcode(), dl, N->getVTList(), Ops); + DAG.ReplaceAllUsesOfValueWith(Op, New); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), New.getValue(1)); + + return New; +} + SDValue AVRTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: @@ -861,6 +909,8 @@ case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); + case ISD::INLINEASM: + return LowerINLINEASM(Op, DAG); } return SDValue(); @@ -1451,6 +1501,10 @@ Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); } + // The R1 register must be passed as an implicit register so that R1 is + // correctly zeroed in interrupts. + Ops.push_back(DAG.getRegister(AVR::R1, MVT::i8)); + // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *Mask = @@ -1572,6 +1626,14 @@ const AVRMachineFunctionInfo *AFI = MF.getInfo(); + if (!AFI->isInterruptOrSignalHandler()) { + // The return instruction has an implicit R1 operand: it must contain zero + // on return. + // This is not needed in interrupts however, where R1 is handled specially + // (only pushed/popped when needed). + RetOps.push_back(DAG.getRegister(AVR::R1, MVT::i8)); + } + unsigned RetOpc = AFI->isInterruptOrSignalHandler() ? AVRISD::RETI_FLAG : AVRISD::RET_FLAG; diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -915,6 +915,7 @@ // neg Rd+1 // neg Rd // sbc Rd+1, r1 + let Uses = [R1] in def NEGWRd : Pseudo<(outs DREGS : $rd), (ins DREGS @@ -1986,6 +1987,7 @@ def ASRWLoRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "asrwlo\t$rd", [(set i16:$rd, (AVRasrlo i16:$src)), (implicit SREG)]>; + let Uses = [R1] in def ROLBRd : Pseudo<(outs GPR8 : $rd), (ins GPR8 diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -39,8 +39,8 @@ // Do not save RA to the SCS if it's not saved to the regular stack, // i.e. RA is not at risk of being overwritten. std::vector &CSI = MF.getFrameInfo().getCalleeSavedInfo(); - if (std::none_of(CSI.begin(), CSI.end(), - [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) + if (llvm::none_of( + CSI, [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) return; Register SCSPReg = RISCVABI::getSCSPReg(); @@ -89,8 +89,8 @@ // See emitSCSPrologue() above. 
std::vector &CSI = MF.getFrameInfo().getCalleeSavedInfo(); - if (std::none_of(CSI.begin(), CSI.end(), - [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) + if (llvm::none_of( + CSI, [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) return; Register SCSPReg = RISCVABI::getSCSPReg(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1104,6 +1104,8 @@ // On RV32, 64-bit integers are split into their high and low parts and held // in two different registers, so the trunc is free since the low register can // just be used. +// FIXME: Should we consider i64->i32 free on RV64 to match the EVT version of +// isTruncateFree? bool RISCVTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { if (Subtarget.is64Bit() || !SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) return false; @@ -1113,8 +1115,10 @@ } bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { - if (Subtarget.is64Bit() || SrcVT.isVector() || DstVT.isVector() || - !SrcVT.isInteger() || !DstVT.isInteger()) + // We consider i64->i32 free on RV64 since we have good selection of W + // instructions that make promoting operations back to i64 free in many cases. + if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || + !DstVT.isInteger()) return false; unsigned SrcBits = SrcVT.getSizeInBits(); unsigned DestBits = DstVT.getSizeInBits(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29735,8 +29735,22 @@ uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); - if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) + if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) { + // Hardware support for vector shifts is sparse which makes us scalarize the + // vector operations in many cases. Also, on sandybridge ADD is faster than + // shl: (shl V, 1) -> (add (freeze V), (freeze V)) + if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) { + // R may be undef at run-time, but (shl R, 1) must be an even number (LSB + // must be 0). (add undef, undef) however can be any value. To make this + // safe, we must freeze R to ensure that register allocation uses the same + // register for an undefined value. This ensures that the result will + // still be even and preserves the original semantics. + R = DAG.getFreeze(R); + return DAG.getNode(ISD::ADD, dl, VT, R, R); + } + return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + } // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || @@ -46674,20 +46688,6 @@ } } - // Hardware support for vector shifts is sparse which makes us scalarize the - // vector operations in many cases. Also, on sandybridge ADD is faster than - // shl. - // (shl V, 1) -> add V,V - if (auto *N1BV = dyn_cast(N1)) - if (auto *N1SplatC = N1BV->getConstantSplatNode()) { - assert(N0.getValueType().isVector() && "Invalid vector shift type"); - // We shift all of the values by one. In many cases we do not have - // hardware support for this operation. This is better expressed as an ADD - // of two values. 
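On the freeze in the re-added fold above: (shl R, 1) always has a zero low bit, even when R is undef, but each use of an undef value may independently take a different value, so the unfrozen (add V, V) form being deleted here could produce an odd result. Freezing first (in IR terms, %f = freeze %r followed by %s = add %f, %f) forces both add operands to be the same concrete value and preserves the evenness guarantee, which is why the combine moves from the generic DAG combine into the shift-lowering path with an explicit DAG.getFreeze(R).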
- if (N1SplatC->isOne()) - return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); - } - return SDValue(); } @@ -47269,12 +47269,18 @@ TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); - assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) || - (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) || - N->getOpcode() == ISD::INSERT_VECTOR_ELT) && + unsigned Opcode = N->getOpcode(); + assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) || + (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) || + Opcode == ISD::INSERT_VECTOR_ELT) && "Unexpected vector insertion"); - if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) { + // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt). + if (Opcode == ISD::INSERT_VECTOR_ELT && N->getOperand(0).isUndef() && + isNullConstant(N->getOperand(2))) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, N->getOperand(1)); + + if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) { unsigned NumBitsPerElt = VT.getScalarSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -546,54 +546,8 @@ writeThinLinkBitcodeToFile(M, *ThinLinkOS, *Index, ModHash); } -class WriteThinLTOBitcode : public ModulePass { - raw_ostream &OS; // raw_ostream to print on - // The output stream on which to emit a minimized module for use - // just in the thin link, if requested. - raw_ostream *ThinLinkOS = nullptr; - -public: - static char ID; // Pass identification, replacement for typeid - WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()) { - initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry()); - } - - explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS) - : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) { - initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { return "ThinLTO Bitcode Writer"; } - - bool runOnModule(Module &M) override { - const ModuleSummaryIndex *Index = - &(getAnalysis().getIndex()); - writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index); - return true; - } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - } -}; } // anonymous namespace -char WriteThinLTOBitcode::ID = 0; -INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode", - "Write ThinLTO Bitcode", false, true) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode", - "Write ThinLTO Bitcode", false, true) - -ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str, - raw_ostream *ThinLinkOS) { - return new WriteThinLTOBitcode(Str, ThinLinkOS); -} - PreservedAnalyses llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { FunctionAnalysisManager &FAM = diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3185,6 
+3185,20 @@
   if (auto *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0)))
     if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, II, *C))
       return I;
+
+    // (extractval ([s/u]subo X, Y), 0) == 0 --> X == Y
+    // (extractval ([s/u]subo X, Y), 0) != 0 --> X != Y
+    // TODO: This checks one-use, but that is not strictly necessary.
+    Value *Cmp0 = Cmp.getOperand(0);
+    Value *X, *Y;
+    if (C->isZero() && Cmp.isEquality() && Cmp0->hasOneUse() &&
+        (match(Cmp0,
+               m_ExtractValue<0>(m_Intrinsic<Intrinsic::ssub_with_overflow>(
+                   m_Value(X), m_Value(Y)))) ||
+         match(Cmp0,
+               m_ExtractValue<0>(m_Intrinsic<Intrinsic::usub_with_overflow>(
+                   m_Value(X), m_Value(Y))))))
+      return new ICmpInst(Cmp.getPredicate(), X, Y);
   }

   if (match(Cmp.getOperand(1), m_APIntAllowUndef(C)))
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -473,7 +473,8 @@
 static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
   // Normal constant int.
   ConstantInt *CI = dyn_cast<ConstantInt>(V);
-  if (CI || !isa<Constant>(V) || !V->getType()->isPointerTy())
+  if (CI || !isa<Constant>(V) || !V->getType()->isPointerTy() ||
+      DL.isNonIntegralPointerType(V->getType()))
     return CI;

   // This is some kind of pointer constant. Turn it into a pointer-sized
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -2908,6 +2908,60 @@
   return nullptr;
 }

+// Transform an snprintf call CI with the bound N to format the string Str
+// either to a call to memcpy, or to a single-character store, or to nothing,
+// and fold the result to a constant. A nonnull StrArg refers to the string
+// argument being formatted. Otherwise the call is one with N < 2 and
+// the "%c" directive to format a single character.
+Value *LibCallSimplifier::emitSnPrintfMemCpy(CallInst *CI, Value *StrArg,
+                                             StringRef Str, uint64_t N,
+                                             IRBuilderBase &B) {
+  assert(StrArg || (N < 2 && Str.size() == 1));
+
+  unsigned IntBits = TLI->getIntSize();
+  uint64_t IntMax = maxIntN(IntBits);
+  if (Str.size() > IntMax)
+    // Bail if the string is longer than INT_MAX. POSIX requires
+    // implementations to set errno to EOVERFLOW in this case, in
+    // addition to when N is larger than that (checked by the caller).
+    return nullptr;
+
+  Value *StrLen = ConstantInt::get(CI->getType(), Str.size());
+  if (N == 0)
+    return StrLen;
+
+  // Set to the number of bytes to copy from StrArg which is also
+  // the offset of the terminating nul.
+  uint64_t NCopy;
+  if (N > Str.size())
+    // Copy the full string, including the terminating nul (which must
+    // be present regardless of the bound).
+    NCopy = Str.size() + 1;
+  else
+    NCopy = N - 1;
+
+  Value *DstArg = CI->getArgOperand(0);
+  if (NCopy && StrArg)
+    // Transform the call to llvm.memcpy(dst, fmt, NCopy).
+    copyFlags(
+        *CI,
+        B.CreateMemCpy(
+            DstArg, Align(1), StrArg, Align(1),
+            ConstantInt::get(DL.getIntPtrType(CI->getContext()), NCopy)));
+
+  if (N > Str.size())
+    // Return early when the whole format string, including the final nul,
+    // has been copied.
+    return StrLen;
+
+  // Otherwise, when truncating the string, append a terminating nul.
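At the source level the truncating case of this helper behaves like the following sketch (an illustration of the intended equivalence, not code from the patch):

  #include <cstring>
  // snprintf(dst, 4, "%s", "hello") folds to approximately:
  int folded(char *dst) {
    std::memcpy(dst, "hel", 3); // NCopy = N - 1 bytes of the string
    dst[3] = '\0';              // terminating nul stored at offset N - 1
    return 5;                   // strlen("hello"), snprintf's return value
  }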
+ Type *Int8Ty = B.getInt8Ty(); + Value *NulOff = B.getIntN(IntBits, NCopy); + Value *DstEnd = B.CreateInBoundsGEP(Int8Ty, DstArg, NulOff, "endptr"); + B.CreateStore(ConstantInt::get(Int8Ty, 0), DstEnd); + return StrLen; +} + Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilderBase &B) { // Check for size @@ -2916,78 +2970,66 @@ return nullptr; uint64_t N = Size->getZExtValue(); + uint64_t IntMax = maxIntN(TLI->getIntSize()); + if (N > IntMax) + // Bail if the bound exceeds INT_MAX. POSIX requires implementations + // to set errno to EOVERFLOW in this case. + return nullptr; + + Value *DstArg = CI->getArgOperand(0); + Value *FmtArg = CI->getArgOperand(2); + // Check for a fixed format string. StringRef FormatStr; - if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr)) + if (!getConstantStringInfo(FmtArg, FormatStr)) return nullptr; // If we just have a format string (nothing else crazy) transform it. if (CI->arg_size() == 3) { - // Make sure there's no % in the constant array. We could try to handle - // %% -> % in the future if we cared. if (FormatStr.contains('%')) - return nullptr; // we found a format specifier, bail out. - - if (N == 0) - return ConstantInt::get(CI->getType(), FormatStr.size()); - else if (N < FormatStr.size() + 1) + // Bail if the format string contains a directive and there are + // no arguments. We could handle "%%" in the future. return nullptr; - // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt, - // strlen(fmt)+1) - copyFlags( - *CI, - B.CreateMemCpy( - CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - FormatStr.size() + 1))); // Copy the null byte. - return ConstantInt::get(CI->getType(), FormatStr.size()); + return emitSnPrintfMemCpy(CI, FmtArg, FormatStr, N, B); } // The remaining optimizations require the format string to be "%s" or "%c" // and have an extra operand. - if (FormatStr.size() == 2 && FormatStr[0] == '%' && CI->arg_size() == 4) { - - // Decode the second character of the format string. - if (FormatStr[1] == 'c') { - if (N == 0) - return ConstantInt::get(CI->getType(), 1); - else if (N == 1) - return nullptr; - - // snprintf(dst, size, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 - if (!CI->getArgOperand(3)->getType()->isIntegerTy()) - return nullptr; - Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char"); - Value *Ptr = castToCStr(CI->getArgOperand(0), B); - B.CreateStore(V, Ptr); - Ptr = B.CreateInBoundsGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); - B.CreateStore(B.getInt8(0), Ptr); + if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->arg_size() != 4) + return nullptr; - return ConstantInt::get(CI->getType(), 1); + // Decode the second character of the format string. + if (FormatStr[1] == 'c') { + if (N <= 1) { + // Use an arbitary string of length 1 to transform the call into + // either a nul store (N == 1) or a no-op (N == 0) and fold it + // to one. 
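Any single-byte string works for CharStr here because with N <= 1 none of the character's bytes are ever stored: N == 1 writes only the terminating nul, N == 0 writes nothing, and in both cases the call still folds to 1, the length that would have been written.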
+ StringRef CharStr("*"); + return emitSnPrintfMemCpy(CI, nullptr, CharStr, N, B); } - if (FormatStr[1] == 's') { - // snprintf(dest, size, "%s", str) to llvm.memcpy(dest, str, len+1, 1) - StringRef Str; - if (!getConstantStringInfo(CI->getArgOperand(3), Str)) - return nullptr; + // snprintf(dst, size, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 + if (!CI->getArgOperand(3)->getType()->isIntegerTy()) + return nullptr; + Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char"); + Value *Ptr = castToCStr(DstArg, B); + B.CreateStore(V, Ptr); + Ptr = B.CreateInBoundsGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); + B.CreateStore(B.getInt8(0), Ptr); + return ConstantInt::get(CI->getType(), 1); + } - if (N == 0) - return ConstantInt::get(CI->getType(), Str.size()); - else if (N < Str.size() + 1) - return nullptr; + if (FormatStr[1] != 's') + return nullptr; - copyFlags( - *CI, B.CreateMemCpy(CI->getArgOperand(0), Align(1), - CI->getArgOperand(3), Align(1), - ConstantInt::get(CI->getType(), Str.size() + 1))); + Value *StrArg = CI->getArgOperand(3); + // snprintf(dest, size, "%s", str) to llvm.memcpy(dest, str, len+1, 1) + StringRef Str; + if (!getConstantStringInfo(StrArg, Str)) + return nullptr; - // The snprintf result is the unincremented number of bytes in the string. - return ConstantInt::get(CI->getType(), Str.size()); - } - } - return nullptr; + return emitSnPrintfMemCpy(CI, StrArg, Str, N, B); } Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilderBase &B) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -205,7 +205,7 @@ /// \returns True if the value is a constant (but not globals/constant /// expressions). static bool isConstant(Value *V) { - return isa(V) && !isa(V) && !isa(V); + return isa(V) && !isa(V); } /// Checks if \p V is one of vector-like instructions, i.e. undef, @@ -2994,7 +2994,7 @@ // okay. auto *In = BundleMember->Inst; assert(In && - (isa(In) || isa(In) || + (isa(In) || In->getNumOperands() == TE->getNumOperands()) && "Missed TreeEntry operands?"); (void)In; // fake use to avoid build failure when assertions disabled @@ -4489,7 +4489,7 @@ } else if (auto *I = dyn_cast(V)) { // Sort other instructions just by the opcodes except for CMPInst. // For CMP also sort by the predicate kind. - if ((isa(I) || isa(I)) && + if ((isa(I)) && isValidForAlternation(I->getOpcode())) { if (AllowAlternate) Key = hash_value(isa(I) ? 1 : 0); @@ -5536,8 +5536,7 @@ unsigned N = 1; Type *EltTy = T; - while (isa(EltTy) || isa(EltTy) || - isa(EltTy)) { + while (isa(EltTy)) { if (auto *ST = dyn_cast(EltTy)) { // Check that struct is homogeneous. for (const auto *Ty : ST->elements()) @@ -5867,9 +5866,9 @@ // Take credit for instruction that will become dead. if (EE->hasOneUse()) { Instruction *Ext = EE->user_back(); - if ((isa(Ext) || isa(Ext)) && - all_of(Ext->users(), - [](User *U) { return isa(U); })) { + if (isa(Ext) && all_of(Ext->users(), [](User *U) { + return isa(U); + })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. Cost -= @@ -6142,18 +6141,18 @@ // Take credit for instruction that will become dead. 
if (EI->hasOneUse()) { Instruction *Ext = EI->user_back(); - if ((isa(Ext) || isa(Ext)) && + if (isa(Ext) && all_of(Ext->users(), [](User *U) { return isa(U); })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - CommonCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, I); - // Add back the cost of s|zext which is subtracted separately. - CommonCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EI->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); - continue; + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + CommonCost -= TTI->getExtractWithExtendCost( + Ext->getOpcode(), Ext->getType(), VecTy, I); + // Add back the cost of s|zext which is subtracted separately. + CommonCost += TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EI->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; } } CommonCost -= @@ -9001,8 +9000,8 @@ for (Instruction &In : llvm::make_early_inc_range(*BB)) { if (isDeleted(&In)) continue; - if (!isa(&In) && !isa(&In) && - !isa(&In) && !GatherShuffleSeq.contains(&In)) + if (!isa(&In) && + !GatherShuffleSeq.contains(&In)) continue; // Check if we can replace this instruction with any of the @@ -9660,17 +9659,15 @@ // If the current instruction is a load, update MaxWidth to reflect the // width of the loaded value. - if (isa(I) || isa(I) || - isa(I)) + if (isa(I)) Width = std::max(Width, DL->getTypeSizeInBits(Ty)); // Otherwise, we need to visit the operands of the instruction. We only // handle the interesting cases from buildTree here. If an operand is an // instruction we haven't yet visited and from the same basic block as the // user or the use is a PHI node, we add it to the worklist. 
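Most of the changes in this file collapse chains of type tests into LLVM's variadic isa<>, which accepts several candidate types in one call. The idiom, for reference (illustrative sketch, not code from the patch):

  #include "llvm/IR/Instructions.h"
  static bool isLoadOrExtract(const llvm::Value *V) {
    // One variadic call instead of isa<LoadInst>(V) ||
    // isa<ExtractElementInst>(V) || isa<ExtractValueInst>(V).
    return llvm::isa<llvm::LoadInst, llvm::ExtractElementInst,
                     llvm::ExtractValueInst>(V);
  }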
- else if (isa(I) || isa(I) || isa(I) || - isa(I) || isa(I) || isa(I) || - isa(I)) { + else if (isa(I)) { for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) if (Visited.insert(J).second && @@ -9723,8 +9720,7 @@ break; case Instruction::ZExt: case Instruction::SExt: - if (isa(I->getOperand(0)) || - isa(I->getOperand(0))) + if (isa(I->getOperand(0))) return false; break; @@ -10083,7 +10079,7 @@ InstructionCost Cost = R.getTreeCost(); - LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); @@ -10384,6 +10380,7 @@ CandidateFound = true; MinCost = std::min(MinCost, Cost); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", @@ -10422,8 +10419,7 @@ if (!I) return false; - if ((!isa(I) && !isa(I)) || - isa(I->getType())) + if (!isa(I) || isa(I->getType())) return false; Value *P = I->getParent(); @@ -11224,8 +11220,8 @@ InstructionCost ReductionCost = getReductionCost(TTI, VL, ReduxWidth, RdxFMF); InstructionCost Cost = TreeCost + ReductionCost; + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); if (!Cost.isValid()) { - LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); return nullptr; } if (Cost >= -SLPCostThreshold) { @@ -11533,8 +11529,7 @@ getInsertIndex(LastInsertInst, OperandOffset); if (!OperandIndex) return; - if (isa(InsertedOperand) || - isa(InsertedOperand)) { + if (isa(InsertedOperand)) { findBuildAggregate_rec(cast(InsertedOperand), TTI, BuildVectorOpds, InsertElts, *OperandIndex); @@ -11544,8 +11539,7 @@ } LastInsertInst = dyn_cast(LastInsertInst->getOperand(0)); } while (LastInsertInst != nullptr && - (isa(LastInsertInst) || - isa(LastInsertInst)) && + isa(LastInsertInst) && LastInsertInst->hasOneUse()); } @@ -12240,8 +12234,8 @@ // Ran into an instruction without users, like terminator, or function call // with ignored return value, store. Ignore unused instructions (basing on // instruction type, except for CallInst and InvokeInst). - if (it->use_empty() && (it->getType()->isVoidTy() || isa(it) || - isa(it))) { + if (it->use_empty() && + (it->getType()->isVoidTy() || isa(it))) { KeyNodes.insert(&*it); bool OpsChanged = false; if (ShouldStartVectorizeHorAtStore || !isa(it)) { @@ -12265,8 +12259,7 @@ } } - if (isa(it) || isa(it) || - isa(it)) + if (isa(it)) PostProcessInstructions.push_back(&*it); } diff --git a/llvm/test/CodeGen/AArch64/arm64-csel.ll b/llvm/test/CodeGen/AArch64/arm64-csel.ll --- a/llvm/test/CodeGen/AArch64/arm64-csel.ll +++ b/llvm/test/CodeGen/AArch64/arm64-csel.ll @@ -292,6 +292,32 @@ ret i64 %. } +; Regression test for FalseVal - TrueVal overflow +define i64 @foo18_overflow3(i1 %cmp) nounwind readnone optsize ssp { +; CHECK-LABEL: foo18_overflow3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: tst w0, #0x1 +; CHECK-NEXT: csel x0, x8, xzr, ne +; CHECK-NEXT: ret +entry: + %. = select i1 %cmp, i64 -9223372036854775808, i64 0 + ret i64 %. 
+} + +; Regression test for TrueVal - FalseVal overflow +define i64 @foo18_overflow4(i1 %cmp) nounwind readnone optsize ssp { +; CHECK-LABEL: foo18_overflow4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: tst w0, #0x1 +; CHECK-NEXT: csel x0, xzr, x8, ne +; CHECK-NEXT: ret +entry: + %. = select i1 %cmp, i64 0, i64 -9223372036854775808 + ret i64 %. +} + define i64 @foo19(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: foo19: ; CHECK: // %bb.0: // %entry diff --git a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll --- a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll +++ b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll @@ -54,9 +54,8 @@ define <2 x i32> @sext_extract_zext_idx0(<4 x i16> %vec) nounwind { ; CHECK-LABEL: sext_extract_zext_idx0: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %zext = zext <4 x i16> %vec to <4 x i32> %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0) @@ -65,6 +64,21 @@ ret <2 x i32> %sext_inreg } +; Negative test, combine should not fire if sign extension is for a different width. +define <2 x i32> @sext_extract_zext_idx0_negtest(<4 x i16> %vec) nounwind { +; CHECK-LABEL: sext_extract_zext_idx0_negtest: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: shl v0.2s, v0.2s, #17 +; CHECK-NEXT: sshr v0.2s, v0.2s, #17 +; CHECK-NEXT: ret + %zext = zext <4 x i16> %vec to <4 x i32> + %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0) + %sext_inreg_step0 = shl <2 x i32> %extract, <i32 17, i32 17> + %sext_inreg = ashr <2 x i32> %sext_inreg_step0, <i32 17, i32 17> + ret <2 x i32> %sext_inreg +} + define <4 x i16> @sext_extract_sext_idx0(<8 x i8> %vec) nounwind { ; CHECK-LABEL: sext_extract_sext_idx0: ; CHECK: // %bb.0: @@ -81,10 +95,9 @@ define <2 x i32> @sext_extract_zext_idx2(<4 x i16> %vec) nounwind { ; CHECK-LABEL: sext_extract_zext_idx2: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %zext = zext <4 x i16> %vec to <4 x i32> %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 2) diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll @@ -0,0 +1,337 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple aarch64-apple-darwin | FileCheck %s + +define <4 x double> @test_ldnp_v4f64(<4 x double>* %A) { +; CHECK-LABEL: test_ldnp_v4f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <4 x double>, <4 x double>* %A, align 8, !nontemporal !0 + ret <4 x double> %lv +} + +define <4 x i64> @test_ldnp_v4i64(<4 x i64>* %A) { +; CHECK-LABEL: test_ldnp_v4i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <4 x i64>, <4 x i64>* %A, align 8, !nontemporal !0 + ret <4 x i64> %lv +} + +define <8 x i32> @test_ldnp_v8i32(<8 x i32>* %A) { +; CHECK-LABEL: test_ldnp_v8i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; 
CHECK-NEXT: ret + %lv = load <8 x i32>, <8 x i32>* %A, align 8, !nontemporal !0 + ret <8 x i32> %lv +} + +define <8 x float> @test_ldnp_v8f32(<8 x float>* %A) { +; CHECK-LABEL: test_ldnp_v8f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <8 x float>, <8 x float>* %A, align 8, !nontemporal !0 + ret <8 x float> %lv +} + +define <16 x i16> @test_ldnp_v16i16(<16 x i16>* %A) { +; CHECK-LABEL: test_ldnp_v16i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <16 x i16>, <16 x i16>* %A, align 8, !nontemporal !0 + ret <16 x i16> %lv +} + +define <16 x half> @test_ldnp_v16f16(<16 x half>* %A) { +; CHECK-LABEL: test_ldnp_v16f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <16 x half>, <16 x half>* %A, align 8, !nontemporal !0 + ret <16 x half> %lv +} + +define <32 x i8> @test_ldnp_v32i8(<32 x i8>* %A) { +; CHECK-LABEL: test_ldnp_v32i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <32 x i8>, <32 x i8>* %A, align 8, !nontemporal !0 + ret <32 x i8> %lv +} + +define <4 x i32> @test_ldnp_v4i32(<4 x i32>* %A) { +; CHECK-LABEL: test_ldnp_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %lv = load <4 x i32>, <4 x i32>* %A, align 8, !nontemporal !0 + ret <4 x i32> %lv +} + +define <4 x float> @test_ldnp_v4f32(<4 x float>* %A) { +; CHECK-LABEL: test_ldnp_v4f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %lv = load <4 x float>, <4 x float>* %A, align 8, !nontemporal !0 + ret <4 x float> %lv +} + +define <8 x i16> @test_ldnp_v8i16(<8 x i16>* %A) { +; CHECK-LABEL: test_ldnp_v8i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %lv = load <8 x i16>, <8 x i16>* %A, align 8, !nontemporal !0 + ret <8 x i16> %lv +} + +define <16 x i8> @test_ldnp_v16i8(<16 x i8>* %A) { +; CHECK-LABEL: test_ldnp_v16i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %lv = load <16 x i8>, <16 x i8>* %A, align 8, !nontemporal !0 + ret <16 x i8> %lv +} +define <2 x double> @test_ldnp_v2f64(<2 x double>* %A) { +; CHECK-LABEL: test_ldnp_v2f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %lv = load <2 x double>, <2 x double>* %A, align 8, !nontemporal !0 + ret <2 x double> %lv +} + +define <2 x i32> @test_ldnp_v2i32(<2 x i32>* %A) { +; CHECK-LABEL: test_ldnp_v2i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <2 x i32>, <2 x i32>* %A, align 8, !nontemporal !0 + ret <2 x i32> %lv +} + +define <2 x float> @test_ldnp_v2f32(<2 x float>* %A) { +; CHECK-LABEL: test_ldnp_v2f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <2 x float>, <2 x float>* %A, align 8, !nontemporal !0 + ret <2 x float> %lv +} + +define <4 x i16> @test_ldnp_v4i16(<4 x i16>* %A) { +; CHECK-LABEL: test_ldnp_v4i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <4 x i16>, <4 x i16>* %A, align 8, !nontemporal !0 + ret <4 x i16> %lv +} + +define <8 x i8> @test_ldnp_v8i8(<8 x i8>* %A) { +; CHECK-LABEL: test_ldnp_v8i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <8 x i8>, <8 x i8>* %A, align 8, !nontemporal !0 + ret <8 x i8> %lv +} + +define <1 x double> @test_ldnp_v1f64(<1 x double>* %A) { +; CHECK-LABEL: test_ldnp_v1f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <1 x double>, <1 x double>* %A, align 8, !nontemporal !0 + ret <1 x double> %lv +} +
+define <1 x i64> @test_ldnp_v1i64(<1 x i64>* %A) { +; CHECK-LABEL: test_ldnp_v1i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <1 x i64>, <1 x i64>* %A, align 8, !nontemporal !0 + ret <1 x i64> %lv +} + +define <32 x i16> @test_ldnp_v32i16(<32 x i16>* %A) { +; CHECK-LABEL: test_ldnp_v32i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ret + %lv = load <32 x i16>, <32 x i16>* %A, align 8, !nontemporal !0 + ret <32 x i16> %lv +} + +define <32 x half> @test_ldnp_v32f16(<32 x half>* %A) { +; CHECK-LABEL: test_ldnp_v32f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ret + %lv = load <32 x half>, <32 x half>* %A, align 8, !nontemporal !0 + ret <32 x half> %lv +} + +define <16 x i32> @test_ldnp_v16i32(<16 x i32>* %A) { +; CHECK-LABEL: test_ldnp_v16i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ret + %lv = load <16 x i32>, <16 x i32>* %A, align 8, !nontemporal !0 + ret <16 x i32> %lv +} + +define <16 x float> @test_ldnp_v16f32(<16 x float>* %A) { +; CHECK-LABEL: test_ldnp_v16f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ret + %lv = load <16 x float>, <16 x float>* %A, align 8, !nontemporal !0 + ret <16 x float> %lv +} + +define <17 x float> @test_ldnp_v17f32(<17 x float>* %A) { +; CHECK-LABEL: test_ldnp_v17f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q1, q2, [x0, #32] +; CHECK-NEXT: ldp q3, q4, [x0] +; CHECK-NEXT: ldr s0, [x0, #64] +; CHECK-NEXT: stp q3, q4, [x8] +; CHECK-NEXT: stp q1, q2, [x8, #32] +; CHECK-NEXT: str s0, [x8, #64] +; CHECK-NEXT: ret + %lv = load <17 x float>, <17 x float>* %A, align 8, !nontemporal !0 + ret <17 x float> %lv +} + +define <33 x double> @test_ldnp_v33f64(<33 x double>* %A) { +; CHECK-LABEL: test_ldnp_v33f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ldp q4, q5, [x0, #64] +; CHECK-NEXT: ldp q6, q7, [x0, #96] +; CHECK-NEXT: ldp q16, q17, [x0, #128] +; CHECK-NEXT: ldp q18, q19, [x0, #160] +; CHECK-NEXT: ldp q21, q22, [x0, #224] +; CHECK-NEXT: ldp q23, q24, [x0, #192] +; CHECK-NEXT: ldr d20, [x0, #256] +; CHECK-NEXT: stp q0, q1, [x8] +; CHECK-NEXT: stp q2, q3, [x8, #32] +; CHECK-NEXT: stp q4, q5, [x8, #64] +; CHECK-NEXT: str d20, [x8, #256] +; CHECK-NEXT: stp q6, q7, [x8, #96] +; CHECK-NEXT: stp q16, q17, [x8, #128] +; CHECK-NEXT: stp q18, q19, [x8, #160] +; CHECK-NEXT: stp q23, q24, [x8, #192] +; CHECK-NEXT: stp q21, q22, [x8, #224] +; CHECK-NEXT: ret + %lv = load <33 x double>, <33 x double>* %A, align 8, !nontemporal !0 + ret <33 x double> %lv +} + +define <33 x i8> @test_ldnp_v33i8(<33 x i8>* %A) { +; CHECK-LABEL: test_ldnp_v33i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldrb w9, [x0, #32] +; CHECK-NEXT: stp q1, q0, [x8] +; CHECK-NEXT: strb w9, [x8, #32] +; CHECK-NEXT: ret + %lv = load <33 x i8>, <33 x i8>* %A, align 8, !nontemporal !0 + ret <33 x i8> %lv +} + +define <4 x i65> @test_ldnp_v4i65(<4 x i65>* %A) { +; CHECK-LABEL: test_ldnp_v4i65: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp x8, x9, [x0, #8] +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr x10, [x0, #24] +; CHECK-NEXT: and x1, x8, #0x1 +; CHECK-NEXT: ldrb w11, [x0, #32] +; CHECK-NEXT: extr x2, x9, x8, #1 +; CHECK-NEXT: extr x4, x10, x9, #2 +; CHECK-NEXT: extr x6, x11, x10, #3 +; CHECK-NEXT: ubfx x3, x9, #1, #1 +; CHECK-NEXT: mov.d v0[1], x1 +; CHECK-NEXT: 
ubfx x5, x10, #2, #1 +; CHECK-NEXT: ubfx x7, x11, #3, #1 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %lv = load <4 x i65>, <4 x i65>* %A, align 8, !nontemporal !0 + ret <4 x i65> %lv +} + +define <4 x i63> @test_ldnp_v4i63(<4 x i63>* %A) { +; CHECK-LABEL: test_ldnp_v4i63: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp x8, x9, [x0] +; CHECK-NEXT: ldp x10, x11, [x0, #16] +; CHECK-NEXT: extr x12, x9, x8, #63 +; CHECK-NEXT: and x0, x8, #0x7fffffffffffffff +; CHECK-NEXT: extr x9, x10, x9, #62 +; CHECK-NEXT: extr x10, x11, x10, #61 +; CHECK-NEXT: and x1, x12, #0x7fffffffffffffff +; CHECK-NEXT: and x2, x9, #0x7fffffffffffffff +; CHECK-NEXT: and x3, x10, #0x7fffffffffffffff +; CHECK-NEXT: ret + %lv = load <4 x i63>, <4 x i63>* %A, align 8, !nontemporal !0 + ret <4 x i63> %lv +} + +define <5 x double> @test_ldnp_v5f64(<5 x double>* %A) { +; CHECK-LABEL: test_ldnp_v5f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: ext.16b v3, v2, v2, #8 +; CHECK-NEXT: ldr d4, [x0, #32] +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: ; kill: def $d3 killed $d3 killed $q3 +; CHECK-NEXT: ; kill: def $d4 killed $d4 killed $q4 +; CHECK-NEXT: ret + %lv = load <5 x double>, <5 x double>* %A, align 8, !nontemporal !0 + ret <5 x double> %lv +} + +define <16 x i64> @test_ldnp_v16i64(<16 x i64>* %A) { +; CHECK-LABEL: test_ldnp_v16i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ldp q4, q5, [x0, #64] +; CHECK-NEXT: ldp q6, q7, [x0, #96] +; CHECK-NEXT: ret + %lv = load <16 x i64>, <16 x i64>* %A, align 8, !nontemporal !0 + ret <16 x i64> %lv +} + +define <16 x double> @test_ldnp_v16f64(<16 x double>* %A) { +; CHECK-LABEL: test_ldnp_v16f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ldp q4, q5, [x0, #64] +; CHECK-NEXT: ldp q6, q7, [x0, #96] +; CHECK-NEXT: ret + %lv = load <16 x double>, <16 x double>* %A, align 8, !nontemporal !0 + ret <16 x double> %lv +} + + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -575,9 +575,7 @@ ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v1.4h, v1.4h, #0.0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov w9, v1.s[1] ; CHECK-NEXT: ldr q1, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -17,15 +17,11 @@ ; CHECK-NEXT: ldr s2, [x1] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: fcmeq v1.4h, v1.4h, v2.4h -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov w9, v1.s[1] ; CHECK-NEXT: mov v0.h[0], w8 ; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: shl v0.4h, v0.4h, #15 -; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 ; CHECK-NEXT: cmpne p0.h, p0/z, 
z0.h, #0 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -538,9 +538,7 @@ ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0 ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: shl v2.2s, v2.2s, #16 -; CHECK-NEXT: sshr v2.2s, v2.2s, #16 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: mov w9, v2.s[1] ; CHECK-NEXT: ldr q2, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -17,9 +17,7 @@ ; CHECK-NEXT: ldr s2, [x1] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: fcmeq v2.4h, v1.4h, v2.4h -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: shl v2.2s, v2.2s, #16 -; CHECK-NEXT: sshr v2.2s, v2.2s, #16 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: mov w9, v2.s[1] ; CHECK-NEXT: mov v0.h[0], w8 diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll --- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll @@ -326,26 +326,16 @@ define <8 x double> @sitofp_i16_double(<8 x i16> %a) { ; CHECK-LABEL: sitofp_i16_double: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: shl v2.2s, v1.2s, #16 -; CHECK-NEXT: shl v3.2s, v0.2s, #16 -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: sshr v2.2s, v2.2s, #16 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: sshr v3.2s, v3.2s, #16 -; CHECK-NEXT: sshll v2.2d, v2.2s, #0 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-NEXT: scvtf v2.2d, v2.2d -; CHECK-NEXT: sshr v1.2s, v1.2s, #16 -; CHECK-NEXT: sshr v0.2s, v0.2s, #16 -; CHECK-NEXT: sshll v3.2d, v3.2s, #0 +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll2 v2.2d, v1.4s, #0 +; CHECK-NEXT: sshll2 v3.2d, v0.4s, #0 ; CHECK-NEXT: sshll v4.2d, v1.2s, #0 -; CHECK-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-NEXT: scvtf v0.2d, v3.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: scvtf v3.2d, v4.2d +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: scvtf v1.2d, v3.2d +; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: scvtf v3.2d, v2.2d +; CHECK-NEXT: scvtf v2.2d, v4.2d ; CHECK-NEXT: ret %1 = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -8,50 +8,51 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: s_or_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s4 -; CHECK-NEXT: v_mov_b32_e32 v15, v1 -; CHECK-NEXT: v_mov_b32_e32 v14, v2 -; CHECK-NEXT: 
v_mov_b32_e32 v13, v3 -; CHECK-NEXT: v_mov_b32_e32 v12, v4 -; CHECK-NEXT: v_mov_b32_e32 v11, v5 -; CHECK-NEXT: v_mov_b32_e32 v10, v6 -; CHECK-NEXT: v_mov_b32_e32 v9, v7 +; CHECK-NEXT: v_mov_b32_e32 v14, v1 +; CHECK-NEXT: v_mov_b32_e32 v13, v2 +; CHECK-NEXT: v_mov_b32_e32 v12, v3 +; CHECK-NEXT: v_mov_b32_e32 v11, v4 +; CHECK-NEXT: v_mov_b32_e32 v10, v5 +; CHECK-NEXT: v_mov_b32_e32 v9, v6 +; CHECK-NEXT: v_mov_b32_e32 v8, v7 ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec -; CHECK-NEXT: v_mov_b32_e32 v1, v15 -; CHECK-NEXT: v_mov_b32_e32 v2, v14 -; CHECK-NEXT: v_mov_b32_e32 v3, v13 -; CHECK-NEXT: v_mov_b32_e32 v4, v12 -; CHECK-NEXT: v_mov_b32_e32 v5, v11 -; CHECK-NEXT: v_mov_b32_e32 v6, v10 -; CHECK-NEXT: v_mov_b32_e32 v7, v9 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v1, v14 +; CHECK-NEXT: v_mov_b32_e32 v2, v13 +; CHECK-NEXT: v_mov_b32_e32 v3, v12 +; CHECK-NEXT: v_mov_b32_e32 v4, v11 +; CHECK-NEXT: v_mov_b32_e32 v5, v10 +; CHECK-NEXT: v_mov_b32_e32 v6, v9 +; CHECK-NEXT: v_mov_b32_e32 v7, v8 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s4, s8 ; CHECK-NEXT: s_mov_b32 s5, s8 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_writelane_b32 v8, s4, 0 -; CHECK-NEXT: v_writelane_b32 v8, s5, 1 -; CHECK-NEXT: v_writelane_b32 v8, s6, 2 -; CHECK-NEXT: v_writelane_b32 v8, s7, 3 +; CHECK-NEXT: v_writelane_b32 v16, s4, 0 +; CHECK-NEXT: v_writelane_b32 v16, s5, 1 +; CHECK-NEXT: v_writelane_b32 v16, s6, 2 +; CHECK-NEXT: v_writelane_b32 v16, s7, 3 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_writelane_b32 v8, s4, 4 +; CHECK-NEXT: v_writelane_b32 v16, s4, 4 ; 
CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -59,16 +60,15 @@ ; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v7, v9 -; CHECK-NEXT: v_mov_b32_e32 v6, v10 -; CHECK-NEXT: v_mov_b32_e32 v5, v11 -; CHECK-NEXT: v_mov_b32_e32 v4, v12 -; CHECK-NEXT: v_mov_b32_e32 v3, v13 -; CHECK-NEXT: v_mov_b32_e32 v2, v14 -; CHECK-NEXT: v_mov_b32_e32 v1, v15 -; CHECK-NEXT: v_mov_b32_e32 v0, v16 +; CHECK-NEXT: v_mov_b32_e32 v7, v8 +; CHECK-NEXT: v_mov_b32_e32 v6, v9 +; CHECK-NEXT: v_mov_b32_e32 v5, v10 +; CHECK-NEXT: v_mov_b32_e32 v4, v11 +; CHECK-NEXT: v_mov_b32_e32 v3, v12 +; CHECK-NEXT: v_mov_b32_e32 v2, v13 +; CHECK-NEXT: v_mov_b32_e32 v1, v14 +; CHECK-NEXT: v_mov_b32_e32 v0, v15 ; CHECK-NEXT: v_readfirstlane_b32 s12, v7 ; CHECK-NEXT: v_readfirstlane_b32 s10, v6 ; CHECK-NEXT: v_readfirstlane_b32 s9, v5 @@ -85,22 +85,22 @@ ; CHECK-NEXT: s_mov_b32 s17, s6 ; CHECK-NEXT: s_mov_b32 s18, s5 ; CHECK-NEXT: s_mov_b32 s19, s4 -; CHECK-NEXT: v_writelane_b32 v8, s12, 5 -; CHECK-NEXT: v_writelane_b32 v8, s13, 6 -; CHECK-NEXT: v_writelane_b32 v8, s14, 7 -; CHECK-NEXT: v_writelane_b32 v8, s15, 8 -; CHECK-NEXT: v_writelane_b32 v8, s16, 9 -; CHECK-NEXT: v_writelane_b32 v8, s17, 10 -; CHECK-NEXT: v_writelane_b32 v8, s18, 11 -; CHECK-NEXT: v_writelane_b32 v8, s19, 12 -; CHECK-NEXT: v_mov_b32_e32 v6, v9 -; CHECK-NEXT: v_mov_b32_e32 v7, v10 -; CHECK-NEXT: v_mov_b32_e32 v4, v11 -; CHECK-NEXT: v_mov_b32_e32 v5, v12 -; CHECK-NEXT: v_mov_b32_e32 v2, v13 -; CHECK-NEXT: v_mov_b32_e32 v3, v14 -; CHECK-NEXT: v_mov_b32_e32 v0, v15 -; CHECK-NEXT: v_mov_b32_e32 v1, v16 +; CHECK-NEXT: v_writelane_b32 v16, s12, 5 +; CHECK-NEXT: v_writelane_b32 v16, s13, 6 +; CHECK-NEXT: v_writelane_b32 v16, s14, 7 +; CHECK-NEXT: v_writelane_b32 v16, s15, 8 +; CHECK-NEXT: v_writelane_b32 v16, s16, 9 +; CHECK-NEXT: v_writelane_b32 v16, s17, 10 +; CHECK-NEXT: v_writelane_b32 v16, s18, 11 +; CHECK-NEXT: v_writelane_b32 v16, s19, 12 +; CHECK-NEXT: v_mov_b32_e32 v6, v8 +; CHECK-NEXT: v_mov_b32_e32 v7, v9 +; CHECK-NEXT: v_mov_b32_e32 v4, v10 +; CHECK-NEXT: v_mov_b32_e32 v5, v11 +; CHECK-NEXT: v_mov_b32_e32 v2, v12 +; CHECK-NEXT: v_mov_b32_e32 v3, v13 +; CHECK-NEXT: v_mov_b32_e32 v0, v14 +; CHECK-NEXT: v_mov_b32_e32 v1, v15 ; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13] ; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15] ; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17] @@ -113,40 +113,40 @@ ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1] ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 -; CHECK-NEXT: v_writelane_b32 v8, s4, 13 +; CHECK-NEXT: v_writelane_b32 v16, s4, 13 ; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s4, v8, 13 -; CHECK-NEXT: 
v_readlane_b32 s8, v8, 5 -; CHECK-NEXT: v_readlane_b32 s9, v8, 6 -; CHECK-NEXT: v_readlane_b32 s10, v8, 7 -; CHECK-NEXT: v_readlane_b32 s11, v8, 8 -; CHECK-NEXT: v_readlane_b32 s12, v8, 9 -; CHECK-NEXT: v_readlane_b32 s13, v8, 10 -; CHECK-NEXT: v_readlane_b32 s14, v8, 11 -; CHECK-NEXT: v_readlane_b32 s15, v8, 12 -; CHECK-NEXT: v_readlane_b32 s16, v8, 0 -; CHECK-NEXT: v_readlane_b32 s17, v8, 1 -; CHECK-NEXT: v_readlane_b32 s18, v8, 2 -; CHECK-NEXT: v_readlane_b32 s19, v8, 3 +; CHECK-NEXT: v_readlane_b32 s4, v16, 13 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s8, v16, 5 +; CHECK-NEXT: v_readlane_b32 s9, v16, 6 +; CHECK-NEXT: v_readlane_b32 s10, v16, 7 +; CHECK-NEXT: v_readlane_b32 s11, v16, 8 +; CHECK-NEXT: v_readlane_b32 s12, v16, 9 +; CHECK-NEXT: v_readlane_b32 s13, v16, 10 +; CHECK-NEXT: v_readlane_b32 s14, v16, 11 +; CHECK-NEXT: v_readlane_b32 s15, v16, 12 +; CHECK-NEXT: v_readlane_b32 s16, v16, 0 +; CHECK-NEXT: v_readlane_b32 s17, v16, 1 +; CHECK-NEXT: v_readlane_b32 s18, v16, 2 +; CHECK-NEXT: v_readlane_b32 s19, v16, 3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_readlane_b32 s4, v8, 4 +; CHECK-NEXT: v_readlane_b32 s4, v16, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: s_or_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -32,39 +32,39 @@ ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s2, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s2, 0 ; GCN_DBG-NEXT: s_load_dword s1, s[0:1], 0xa ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s2, -1 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB0_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, 
s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock ; GCN_DBG-NEXT: s_endpgm @@ -107,35 +107,35 @@ ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_branch .LBB1_2 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB1_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1 ; GCN_DBG-NEXT: s_branch .LBB1_2 entry: @@ -172,35 +172,35 @@ ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_branch .LBB2_2 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB2_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; 
GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1 ; GCN_DBG-NEXT: s_branch .LBB2_2 entry: @@ -238,33 +238,33 @@ ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_branch .LBB3_2 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB3_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN_DBG-NEXT: s_branch .LBB3_2 entry: @@ -316,48 +316,48 @@ ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, 0 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: ds_read_u8 v1, v1 +; GCN_DBG-NEXT: ds_read_u8 v0, v0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v1 +; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v0 ; GCN_DBG-NEXT: s_and_b32 s0, 1, s0 ; GCN_DBG-NEXT: s_cmp_eq_u32 s0, 1 ; GCN_DBG-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: v_writelane_b32 v0, s1, 2 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s1, 2 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3 ; GCN_DBG-NEXT: s_branch .LBB4_2 ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB4_2: ; 
%for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s3, v0, 2 -; GCN_DBG-NEXT: v_readlane_b32 s4, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 3 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s3, v2, 2 +; GCN_DBG-NEXT: v_readlane_b32 s4, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 ; GCN_DBG-NEXT: s_mov_b32 s4, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s4, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s4 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s4 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1 ; GCN_DBG-NEXT: s_branch .LBB4_2 entry: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -420,11 +420,11 @@ ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0: [[INNER_LOOP:.LBB[0-9]+_[0-9]+]]: -; GCN-O0: buffer_load_dword ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] +; GCN-O0: buffer_load_dword ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -48,9 +48,6 @@ ; VMEM: [[ENDIF]]: -; Restore val -; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload - ; Reload and restore exec mask ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -62,6 +59,9 @@ ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] +; Restore val +; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload + ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]] define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 { entry: @@ -121,7 +121,6 @@ ; GCN: 
buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: [[END]]: -; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -131,6 +130,7 @@ ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] +; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 { @@ -187,7 +187,6 @@ ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] ; GCN: [[FLOW]]: ; %Flow -; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -199,6 +198,7 @@ ; GCN: s_or_saveexec_b64 s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]], s[[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]] ; Regular spill value restored after exec modification +; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; Followed by spill ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill @@ -230,7 +230,6 @@ ; GCN-NEXT: s_branch [[FLOW]] ; GCN: [[ENDIF]]: -; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]] @@ -242,6 +241,7 @@ ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] +; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -13,7 +13,7 @@ ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 + ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset.cast, align 4, addrspace 4) ; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1 ; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1 @@ -23,7 +23,7 @@ ; GCN-NEXT: renamable $sgpr1 = COPY killed renamable $sgpr6 ; GCN-NEXT: renamable $sgpr2 = COPY killed renamable $sgpr5 ; GCN-NEXT: renamable $sgpr3 = COPY 
killed renamable $sgpr4 - ; GCN-NEXT: SI_SPILL_S128_SAVE killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.1, align 4, addrspace 5) + ; GCN-NEXT: SI_SPILL_S128_SAVE killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.2, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr0 = S_MOV_B32 16 ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 15 ; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 14 @@ -40,55 +40,59 @@ ; GCN-NEXT: renamable $sgpr13 = S_MOV_B32 2 ; GCN-NEXT: renamable $sgpr14 = S_MOV_B32 1 ; GCN-NEXT: renamable $sgpr15 = S_MOV_B32 0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr10 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr9 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr8 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr7 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr6 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr5 - ; GCN-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr4 - ; GCN-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr3 - ; GCN-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr2 - ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr1 - ; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr0 - ; GCN-NEXT: undef %28.sub0:vreg_512 = COPY [[COPY1]] - ; GCN-NEXT: %28.sub1:vreg_512 = COPY [[COPY2]] - ; GCN-NEXT: %28.sub2:vreg_512 = COPY [[COPY3]] - ; GCN-NEXT: %28.sub3:vreg_512 = COPY [[COPY4]] - ; GCN-NEXT: %28.sub4:vreg_512 = COPY [[COPY5]] - ; GCN-NEXT: %28.sub5:vreg_512 = COPY [[COPY6]] - ; GCN-NEXT: %28.sub6:vreg_512 = COPY [[COPY7]] - ; GCN-NEXT: %28.sub7:vreg_512 = COPY [[COPY8]] - ; GCN-NEXT: %28.sub8:vreg_512 = COPY [[COPY9]] - ; GCN-NEXT: %28.sub9:vreg_512 = COPY [[COPY10]] - ; GCN-NEXT: %28.sub10:vreg_512 = COPY [[COPY11]] - ; GCN-NEXT: %28.sub11:vreg_512 = COPY [[COPY12]] - ; GCN-NEXT: %28.sub12:vreg_512 = COPY [[COPY13]] - ; GCN-NEXT: %28.sub13:vreg_512 = COPY [[COPY14]] - ; GCN-NEXT: %28.sub14:vreg_512 = COPY [[COPY15]] - ; GCN-NEXT: %28.sub15:vreg_512 = COPY [[COPY16]] + ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr15 + ; GCN-NEXT: renamable $vgpr30 = COPY killed renamable $sgpr14 + ; GCN-NEXT: renamable $vgpr29 = COPY killed renamable $sgpr13 + ; GCN-NEXT: renamable $vgpr28 = COPY killed renamable $sgpr12 + ; GCN-NEXT: renamable $vgpr27 = COPY killed renamable $sgpr11 + ; GCN-NEXT: renamable $vgpr26 = COPY killed renamable $sgpr10 + ; GCN-NEXT: renamable $vgpr25 = COPY killed renamable $sgpr9 + ; GCN-NEXT: renamable $vgpr24 = COPY killed renamable $sgpr8 + ; GCN-NEXT: renamable $vgpr23 = COPY killed renamable $sgpr7 + ; GCN-NEXT: renamable $vgpr22 = COPY killed renamable $sgpr6 + ; GCN-NEXT: renamable $vgpr21 = COPY killed renamable $sgpr5 + ; GCN-NEXT: renamable $vgpr20 = COPY killed renamable $sgpr4 + ; GCN-NEXT: renamable $vgpr19 = COPY killed renamable $sgpr3 + ; GCN-NEXT: renamable $vgpr18 = COPY killed renamable $sgpr2 + ; GCN-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr1 + ; GCN-NEXT: 
renamable $vgpr16 = COPY killed renamable $sgpr0 + ; GCN-NEXT: undef renamable $vgpr0 = COPY killed renamable $vgpr0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: renamable $vgpr1 = COPY killed renamable $vgpr30 + ; GCN-NEXT: renamable $vgpr2 = COPY killed renamable $vgpr29 + ; GCN-NEXT: renamable $vgpr3 = COPY killed renamable $vgpr28 + ; GCN-NEXT: renamable $vgpr4 = COPY killed renamable $vgpr27 + ; GCN-NEXT: renamable $vgpr5 = COPY killed renamable $vgpr26 + ; GCN-NEXT: renamable $vgpr6 = COPY killed renamable $vgpr25 + ; GCN-NEXT: renamable $vgpr7 = COPY killed renamable $vgpr24 + ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr23 + ; GCN-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr22 + ; GCN-NEXT: renamable $vgpr10 = COPY killed renamable $vgpr21 + ; GCN-NEXT: renamable $vgpr11 = COPY killed renamable $vgpr20 + ; GCN-NEXT: renamable $vgpr12 = COPY killed renamable $vgpr19 + ; GCN-NEXT: renamable $vgpr13 = COPY killed renamable $vgpr18 + ; GCN-NEXT: renamable $vgpr14 = COPY killed renamable $vgpr17 + ; GCN-NEXT: renamable $vgpr15 = COPY killed renamable $vgpr16 + ; GCN-NEXT: SI_SPILL_V512_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr0_sgpr1 = IMPLICIT_DEF ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) - ; GCN-NEXT: dead %45:vgpr_32 = COPY [[DEF]] - ; GCN-NEXT: renamable $sgpr2 = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec - ; GCN-NEXT: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, [[COPY]](s32), implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5) + ; GCN-NEXT: $vgpr17 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = SI_SPILL_V512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5) + ; GCN-NEXT: $vgpr16 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; GCN-NEXT: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr16, implicit $exec + ; GCN-NEXT: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, $vgpr16, implicit $exec ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V16 %28, killed $sgpr2, 11, implicit-def $m0, implicit $m0, implicit $exec - ; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_]] + ; GCN-NEXT: renamable $vgpr0 = V_INDIRECT_REG_READ_GPR_IDX_B32_V16 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $sgpr2, 11, implicit-def $m0, implicit $m0, implicit $exec + ; GCN-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY renamable $sgpr0_sgpr1 - ; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5) + ; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) ; GCN-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; GCN-NEXT: {{ $}} @@ -99,8 +103,9 @@ ; GCN-NEXT: $exec = S_MOV_B64 renamable $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_]], killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1) + ; GCN-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -227,14 +227,14 @@ ; W64-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s{{[0-9]+}} ; W64-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], s{{[0-9]+}} -; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill ; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec +; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill ; W64-O0: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; 
W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] @@ -251,7 +251,7 @@ ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill @@ -270,10 +270,10 @@ ; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] ; W64-O0: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] @@ -297,10 +297,10 @@ ; W64-O0: s_xor_b64 exec, exec, [[SAVE]] ; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]] -; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload ; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]] ; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]] ; W64-O0: s_mov_b64 exec, s[[[SAVEEXEC0]]:[[SAVEEXEC1]]] +; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Spill ; W64-O0: [[TERMBB]]: diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -19,10 +19,10 @@ ; CHECK-NEXT: v_writelane_b32 v40, s33, 2 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: .Ltmp0: ; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _ZL13sleep_foreverv@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _ZL13sleep_foreverv@gotpcrel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll 
b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -191,23 +191,23 @@ ; we have no VGPR to allocate for SGPR spills. We are forced to spill to memory. ; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr: -; GCN: v_writelane_b32 v{{[0-9]+}}, s34, 0 -; GCN: v_writelane_b32 v{{[0-9]+}}, s35, 1 -; GCN: v_writelane_b32 v{{[0-9]+}}, s36, 2 -; GCN: v_writelane_b32 v{{[0-9]+}}, s37, 3 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 +; GCN: v_writelane_b32 [[A:v[0-9]+]], s34, 0 +; GCN: buffer_store_dword [[A]], off, s[0:3], s32 +; GCN: v_writelane_b32 [[B:v[0-9]+]], s35, 0 +; GCN: buffer_store_dword [[B]], off, s[0:3], s32 +; GCN: v_writelane_b32 [[C:v[0-9]+]], s36, 0 +; GCN: buffer_store_dword [[C]], off, s[0:3], s32 +; GCN: v_writelane_b32 [[D:v[0-9]+]], s37, 0 +; GCN: buffer_store_dword [[D]], off, s[0:3], s32 ; GCN: #ASMEND -; GCN: buffer_load_dword v{{[0-9]+}} -; GCN: buffer_load_dword v{{[0-9]+}} -; GCN: buffer_load_dword v{{[0-9]+}} -; GCN: buffer_load_dword v{{[0-9]+}} -; GCN: v_readlane_b32 s37, v{{[0-9]+}}, 3 -; GCN: v_readlane_b32 s36, v{{[0-9]+}}, 2 -; GCN: v_readlane_b32 s35, v{{[0-9]+}}, 1 -; GCN: v_readlane_b32 s34, v{{[0-9]+}}, 0 +; GCN: buffer_load_dword [[E:v[0-9]+]] +; GCN: v_readlane_b32 s37, [[E]], 0 +; GCN: buffer_load_dword [[F:v[0-9]+]] +; GCN: v_readlane_b32 s36, [[F]], 0 +; GCN: buffer_load_dword [[G:v[0-9]+]] +; GCN: v_readlane_b32 s35, [[G]], 0 +; GCN: buffer_load_dword [[H:v[0-9]+]] +; GCN: v_readlane_b32 s34, [[H]], 0 define void @spill_sgpr_no_free_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %a = load <4 x i32>, <4 x i32> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -133,7 +133,7 @@ ; GFX9-O0: ; %bb.0: ; %entry ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -144,18 +144,18 @@ ; GFX9-O0-NEXT: s_mov_b32 s39, s7 ; GFX9-O0-NEXT: s_mov_b64 s[42:43], s[38:39] ; GFX9-O0-NEXT: s_mov_b64 s[40:41], s[36:37] -; GFX9-O0-NEXT: v_writelane_b32 v3, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v3, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v3, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v3, s43, 3 +; GFX9-O0-NEXT: v_writelane_b32 v5, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v5, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v5, s42, 2 +; GFX9-O0-NEXT: v_writelane_b32 v5, s43, 3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -165,23 +165,23 @@ ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, s34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec -; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 5 +; GFX9-O0-NEXT: v_writelane_b32 v5, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v5, s35, 5 ; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -194,20 +194,19 @@ ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 5 +; GFX9-O0-NEXT: v_readlane_b32 s34, v5, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v5, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 2 -; GFX9-O0-NEXT: v_readlane_b32 s39, v3, 3 +; GFX9-O0-NEXT: v_readlane_b32 s36, v5, 0 +; GFX9-O0-NEXT: v_readlane_b32 s37, v5, 1 +; GFX9-O0-NEXT: v_readlane_b32 s38, v5, 2 +; GFX9-O0-NEXT: v_readlane_b32 s39, v5, 3 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v3 ; GFX9-O0-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[34:35] ; GFX9-O0-NEXT: s_mov_b32 s34, 1 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 @@ -216,7 +215,7 @@ ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] diff --git a/llvm/test/CodeGen/ARM/vector-store.ll b/llvm/test/CodeGen/ARM/vector-store.ll --- a/llvm/test/CodeGen/ARM/vector-store.ll +++ b/llvm/test/CodeGen/ARM/vector-store.ll @@ -419,3 +419,20 @@ store <3 x i8> zeroinitializer, <3 x i8> *%p, align 4 ret void } + +define void @v3i64shuffle(<3 x i64> *%p, <3 x i64> %a) { +; CHECK-LABEL: v3i64shuffle: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 q8, #0x0 +; CHECK-NEXT: ldrd r12, r1, [sp, #8] +; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vorr d19, d16, d16 +; CHECK-NEXT: str r1, [r0, #20] +; CHECK-NEXT: vst1.32 {d18, d19}, [r0]! +; CHECK-NEXT: str.w r12, [r0] +; CHECK-NEXT: bx lr + %b = shufflevector <3 x i64> %a, <3 x i64> zeroinitializer, <3 x i32> + store <3 x i64> %b, <3 x i64> *%p, align 4 + ret void +} + diff --git a/llvm/test/CodeGen/AVR/interrupts.ll b/llvm/test/CodeGen/AVR/interrupts.ll --- a/llvm/test/CodeGen/AVR/interrupts.ll +++ b/llvm/test/CodeGen/AVR/interrupts.ll @@ -1,18 +1,16 @@ ; RUN: llc < %s -march=avr | FileCheck %s @count = global i8 0 +@funcptr = global void () addrspace(1)* null define avr_intrcc void @interrupt_handler() { ; CHECK-LABEL: interrupt_handler: ; CHECK: sei ; CHECK-NEXT: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; CHECK-NEXT: clr r1 ; CHECK: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti ret void @@ -22,13 +20,10 @@ ; CHECK-LABEL: interrupt_handler_via_ir_attribute: ; CHECK: sei ; CHECK-NEXT: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; CHECK-NEXT: clr r1 ; CHECK: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti ret void @@ -38,13 +33,10 @@ ; CHECK-LABEL: signal_handler: ; CHECK-NOT: sei ; CHECK: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; CHECK-NEXT: clr r1 ; CHECK: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti ret void @@ -54,13 +46,10 @@ ; CHECK-LABEL: signal_handler_via_attribute: ; CHECK-NOT: sei ; CHECK: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; CHECK-NEXT: clr r1 ; CHECK: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti ret void @@ -70,10 +59,8 @@ ; CHECK-LABEL: interrupt_alloca: ; CHECK: sei ; CHECK-NEXT: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; CHECK-NEXT: clr r1 ; CHECK: push r28 ; CHECK-NEXT: push r29 ; CHECK-NEXT: in r28, 61 @@ -94,7 +81,6 @@ ; CHECK-NEXT: pop r28 ; CHECK: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti alloca i8 @@ -104,10 +90,8 @@ define void @signal_handler_with_increment() #1 { ; CHECK-LABEL: signal_handler_with_increment: ; CHECK: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; 
CHECK-NEXT: clr r1 ; CHECK-NEXT: push r24 ; CHECK-NEXT: lds r24, count ; CHECK-NEXT: inc r24 @@ -115,7 +99,6 @@ ; CHECK-NEXT: pop r24 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti %old = load volatile i8, i8* @count @@ -124,6 +107,29 @@ ret void } +; Check that r1 is saved/restored and set to 0 when using inline assembly. +define void @signal_handler_with_asm() #1 { +; CHECK-LABEL: signal_handler_with_asm: +; CHECK: push r0 +; CHECK-NEXT: in r0, 63 +; CHECK-NEXT: push r0 +; CHECK-NEXT: push r1 +; CHECK-NEXT: clr r1 +; CHECK-NEXT: push r24 +; CHECK-NEXT: ldi +; ;APP +; CHECK: mov +; ;NO_APP +; CHECK: pop r24 +; CHECK-NEXT: pop r1 +; CHECK-NEXT: pop r0 +; CHECK-NEXT: out 63, r0 +; CHECK-NEXT: pop r0 +; CHECK-NEXT: reti + call i8 asm sideeffect "mov $0, $1", "=r,r"(i8 3) nounwind + ret void +} + declare void @foo() ; When a signal handler calls a function, it must push/pop all call clobbered @@ -131,9 +137,9 @@ define void @signal_handler_with_call() #1 { ; CHECK-LABEL: signal_handler_with_call: ; CHECK: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 +; CHECK-NEXT: push r1 ; CHECK-NEXT: clr r1 ; CHECK-NEXT: push r18 ; CHECK-NEXT: push r19 @@ -160,14 +166,58 @@ ; CHECK-NEXT: pop r20 ; CHECK-NEXT: pop r19 ; CHECK-NEXT: pop r18 +; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti call void @foo() ret void } +define void @signal_handler_with_icall() #1 { +; CHECK-LABEL: signal_handler_with_icall: +; CHECK: push r0 +; CHECK-NEXT: in r0, 63 +; CHECK-NEXT: push r0 +; CHECK-NEXT: push r1 +; CHECK-NEXT: clr r1 +; CHECK-NEXT: push r18 +; CHECK-NEXT: push r19 +; CHECK-NEXT: push r20 +; CHECK-NEXT: push r21 +; CHECK-NEXT: push r22 +; CHECK-NEXT: push r23 +; CHECK-NEXT: push r24 +; CHECK-NEXT: push r25 +; CHECK-NEXT: push r26 +; CHECK-NEXT: push r27 +; CHECK-NEXT: push r30 +; CHECK-NEXT: push r31 +; CHECK-NEXT: lds r30, funcptr +; CHECK-NEXT: lds r31, funcptr+1 +; CHECK-NEXT: icall +; CHECK-NEXT: pop r31 +; CHECK-NEXT: pop r30 +; CHECK-NEXT: pop r27 +; CHECK-NEXT: pop r26 +; CHECK-NEXT: pop r25 +; CHECK-NEXT: pop r24 +; CHECK-NEXT: pop r23 +; CHECK-NEXT: pop r22 +; CHECK-NEXT: pop r21 +; CHECK-NEXT: pop r20 +; CHECK-NEXT: pop r19 +; CHECK-NEXT: pop r18 +; CHECK-NEXT: pop r1 +; CHECK-NEXT: pop r0 +; CHECK-NEXT: out 63, r0 +; CHECK-NEXT: pop r0 +; CHECK-NEXT: reti + %ptr = load volatile void() addrspace(1)*, void() addrspace(1)** @funcptr + call void %ptr() + ret void +} + attributes #0 = { "interrupt" } attributes #1 = { "signal" } diff --git a/llvm/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir b/llvm/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir +++ /dev/null @@ -1,30 +0,0 @@ -# RUN: llc -O0 %s -o - | FileCheck %s - -# This test checks the expansion of the 16-bit LDWRdPtr pseudo instruction. - ---- | - target triple = "avr--" - define void @test_ldwrdptr() { - entry: - ret void - } -... - ---- -name: test_ldwrdptr -tracksRegLiveness: true -body: | - bb.0.entry: - liveins: $r31r30 - - ; CHECK-LABEL: test_ldwrdptr - - ; CHECK: ld [[SCRATCH:r[0-9]+]], Z - ; CHECK-NEXT: push [[SCRATCH]] - ; CHECK-NEXT: ldd [[SCRATCH]], Z+1 - ; CHECK-NEXT: mov r31, [[SCRATCH]] - ; CHECK-NEXT: pop r30 - - early-clobber $r31r30 = LDWRdPtr undef $r31r30 -... 
- diff --git a/llvm/test/CodeGen/AVR/pseudo/NEGWRd.mir b/llvm/test/CodeGen/AVR/pseudo/NEGWRd.mir --- a/llvm/test/CodeGen/AVR/pseudo/NEGWRd.mir +++ b/llvm/test/CodeGen/AVR/pseudo/NEGWRd.mir @@ -22,5 +22,5 @@ ; CHECK-NEXT: $r14 = NEGRd $r14 ; CHECK-NEXT: $r15 = SBCRdRr $r15, $r1, implicit-def $sreg, implicit killed $sreg - $r15r14 = NEGWRd $r15r14, implicit-def $sreg + $r15r14 = NEGWRd $r15r14, implicit-def $sreg, implicit $r1 ... diff --git a/llvm/test/CodeGen/AVR/pseudo/ROLBrd.mir b/llvm/test/CodeGen/AVR/pseudo/ROLBrd.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AVR/pseudo/ROLBrd.mir @@ -0,0 +1,25 @@ +# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s + +# This test checks the expansion of the 8-bit ROLB (rotate) pseudo instruction. + +--- | + target triple = "avr--" + define void @test_rolbrd() { + entry: + ret void + } +... + +--- +name: test_rolbrd +body: | + bb.0.entry: + liveins: $r14 + + ; CHECK-LABEL: test_rolbrd + + ; CHECK: $r14 = ADDRdRr killed $r14, killed $r14, implicit-def $sreg + ; CHECK-NEXT: $r14 = ADCRdRr $r14, $r1, implicit-def dead $sreg, implicit killed $sreg + + $r14 = ROLBRd $r14, implicit-def $sreg, implicit $r1 +... diff --git a/llvm/test/CodeGen/AVR/unaligned-atomic-loads.ll b/llvm/test/CodeGen/AVR/unaligned-atomic-ops.ll rename from llvm/test/CodeGen/AVR/unaligned-atomic-loads.ll rename to llvm/test/CodeGen/AVR/unaligned-atomic-ops.ll --- a/llvm/test/CodeGen/AVR/unaligned-atomic-loads.ll +++ b/llvm/test/CodeGen/AVR/unaligned-atomic-ops.ll @@ -1,6 +1,6 @@ ; RUN: llc -mattr=addsubiw < %s -march=avr | FileCheck %s -; This verifies that the middle end can handle an unaligned atomic load. +; This verifies that the backend can handle an unaligned atomic load and store. ; ; In the past, an assertion inside the SelectionDAGBuilder would always ; hit an assertion for unaligned loads and stores. @@ -14,6 +14,7 @@ start: %a = getelementptr inbounds %AtomicI16, %AtomicI16* %self, i16 0, i32 0, i32 0 load atomic i16, i16* %a seq_cst, align 1 + store atomic i16 5, i16* %a seq_cst, align 1 ret void } diff --git a/llvm/test/CodeGen/AVR/zeroreg.ll b/llvm/test/CodeGen/AVR/zeroreg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AVR/zeroreg.ll @@ -0,0 +1,27 @@ +; RUN: llc -mattr=avr6,sram < %s -march=avr | FileCheck %s + +; This file tests whether the compiler correctly works with the r1 register, +; clearing it when needed. + +; Test regular use of r1 as a zero register. +; CHECK-LABEL: store8zero: +; CHECK: st {{[XYZ]}}, r1 +; CHECK-NEXT: mov r24, r1 +; CHECK-NEXT: ret +define i8 @store8zero(i8* %x) { + store i8 0, i8* %x + ret i8 0 +} + +; Test that multiplication instructions (mul, muls, etc.) clobber r1 and require +; a "clr r1" instruction. 
+; CHECK-LABEL: mul: +; CHECK: muls +; CHECK-NEXT: clr r1 +; CHECK-NEXT: st {{[XYZ]}}, r0 +; CHECK-NEXT: ret +define void @mul(i8* %ptr, i8 %n) { + %result = mul i8 %n, 3 + store i8 %result, i8* %ptr + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/scalar_to_vector_shuffle.ll deleted file mode 100644 --- a/llvm/test/CodeGen/PowerPC/scalar_to_vector_shuffle.ll +++ /dev/null @@ -1,138 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-LE-P8 -; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-LE-P9 -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-BE-P8 -; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-BE-P9 - -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P8 -; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P9 -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P8 -; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P9 - -define <16 x i8> @test_4_8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) local_unnamed_addr { -; CHECK-LE-P8-LABEL: test_4_8: -; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 -; CHECK-LE-P8-NEXT: addi r3, r5, .LCPI0_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs2, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, f0 -; CHECK-LE-P8-NEXT: xxswapd v3, f1 -; CHECK-LE-P8-NEXT: xxswapd v4, vs2 -; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 -; CHECK-LE-P8-NEXT: blr -; -; CHECK-LE-P9-LABEL: test_4_8: -; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha -; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l -; CHECK-LE-P9-NEXT: xxswapd v2, f0 -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: lxv v4, 0(r3) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 -; CHECK-LE-P9-NEXT: blr -; -; CHECK-BE-P8-LABEL: test_4_8: -; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha -; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-BE-P8-NEXT: addi r3, r5, .LCPI0_0@toc@l -; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 -; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 -; CHECK-BE-P8-NEXT: blr -; -; CHECK-BE-P9-LABEL: test_4_8: -; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 -; 
CHECK-BE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha -; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) -; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l -; CHECK-BE-P9-NEXT: lxv v4, 0(r3) -; CHECK-BE-P9-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 -; CHECK-BE-P9-NEXT: blr -; -; CHECK-AIX-64-P8-LABEL: test_4_8: -; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C0(r2) # %const.0 -; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 -; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-64-P8-NEXT: blr -; -; CHECK-AIX-64-P9-LABEL: test_4_8: -; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C0(r2) # %const.0 -; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) -; CHECK-AIX-64-P9-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) -; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-64-P9-NEXT: blr -; -; CHECK-AIX-32-P8-LABEL: test_4_8: -; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r4) -; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C0(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw v3, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-32-P8-NEXT: blr -; -; CHECK-AIX-32-P9-LABEL: test_4_8: -; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C0(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxmrghw v3, vs1, vs0 -; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-32-P9-NEXT: blr -entry: - %0 = load <4 x i8>, ptr %a, align 4 - %bc1 = bitcast <4 x i8> %0 to i32 - %vecinit3 = insertelement <4 x i32> poison, i32 %bc1, i64 0 - %1 = load <8 x i8>, ptr %b, align 8 - %bc2 = bitcast <8 x i8> %1 to i64 - %vecinit6 = insertelement <2 x i64> undef, i64 %bc2, i64 0 - %2 = bitcast <4 x i32> %vecinit3 to <16 x i8> - %3 = bitcast <2 x i64> %vecinit6 to <16 x i8> - %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> - ret <16 x i8> %shuffle -} diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll @@ -0,0 +1,2090 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LE-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LE-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: 
-ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE-P9 + +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P9 + +define <16 x i8> @test_v16i8_v16i8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v16i8_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lbz r3, 0(r3) +; CHECK-LE-P8-NEXT: lbz r4, 0(r4) +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: mtvsrd v3, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; CHECK-BE-P8-NEXT: lbz r4, 0(r4) +; CHECK-BE-P8-NEXT: lbz r3, 0(r3) +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI0_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P8-NEXT: vperm v2, v4, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; CHECK-BE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-BE-P9-NEXT: lxsibzx v4, 0, r3 +; CHECK-BE-P9-NEXT: addi r5, r5, .LCPI0_0@toc@l +; CHECK-BE-P9-NEXT: lxv v2, 0(r5) +; CHECK-BE-P9-NEXT: vperm v2, v4, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C0(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lbz r4, 0(r4) +; CHECK-AIX-64-P8-NEXT: lbz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: ld r5, L..C0(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxsibzx v4, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r5) +; CHECK-AIX-64-P9-NEXT: vperm v2, v4, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C0(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lbz r4, 0(r4) +; CHECK-AIX-32-P8-NEXT: lbz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r3 +; 
CHECK-AIX-32-P8-NEXT: vperm v2, v4, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lwz r5, L..C0(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxsibzx v4, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r5) +; CHECK-AIX-32-P9-NEXT: vperm v2, v4, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <1 x i8>, ptr %a, align 4 + %bc1 = bitcast <1 x i8> %0 to i8 + %vecinit3 = insertelement <16 x i8> poison, i8 %bc1, i64 0 + %1 = load <1 x i8>, ptr %b, align 8 + %bc2 = bitcast <1 x i8> %1 to i8 + %vecinit6 = insertelement <16 x i8> undef, i8 %bc2, i64 0 + %2 = bitcast <16 x i8> %vecinit3 to <16 x i8> + %3 = bitcast <16 x i8> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v16i8_none(<16 x i8> %a, i8 %b) { +; CHECK-LE-P8-LABEL: test_v16i8_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-LE-P8-NEXT: mtvsrd v4, r5 +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtvsrwz v3, r5 +; CHECK-LE-P9-NEXT: vinsertb v2, v3, 15 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v4, r5 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrwz v3, r5 +; CHECK-BE-P9-NEXT: vinsertb v2, v3, 0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r4, L..C1(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P9-NEXT: vinsertb v2, v3, 0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r4, L..C1(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-32-P9-NEXT: vinsertb v2, v3, 0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %vecins = insertelement <16 x i8> %a, i8 %b, i32 0 + ret <16 x i8> %vecins +} + +define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_none_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI2_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: mtvsrd v4, r3 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI2_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r5 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; 
CHECK-LE-P9-LABEL: test_none_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtvsrd v3, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI2_0@toc@ha +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI2_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C2(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C1(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs = load <16 x i8>, ptr %b, align 4 + %rhs = insertelement <16 x i8> undef, i8 %arg, i32 0 + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v16i8_v8i16(i16 %arg, i8 %arg1) { +; CHECK-LE-P8-LABEL: test_v16i8_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r4, r4, 56 +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: mtvsrd v2, r4 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r4, r4, 56 +; 
CHECK-BE-P9-NEXT: sldi r3, r3, 48 +; CHECK-BE-P9-NEXT: mtvsrd v2, r4 +; CHECK-BE-P9-NEXT: mtvsrd v3, r3 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -32 +; CHECK-AIX-32-P8-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v8i16_v16i8(i16 %arg, i8 %arg1) { +; CHECK-LE-P8-LABEL: test_v8i16_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r4, r4, 56 +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: mtvsrd v2, r4 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r4, r4, 56 +; CHECK-BE-P9-NEXT: sldi r3, r3, 48 +; CHECK-BE-P9-NEXT: mtvsrd v2, r4 +; CHECK-BE-P9-NEXT: mtvsrd v3, r3 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd 
v3, r3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -32 +; CHECK-AIX-32-P8-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %rhs = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_none_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI5_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: mtvsrd v4, r3 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI5_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r5 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtvsrd v3, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI5_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI5_0@toc@ha +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI5_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI5_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C3(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, 
r5 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs = load <16 x i8>, ptr %b, align 4 + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <8 x i16> @test_v8i16_none(<8 x i16> %a, i16 %b) { +; CHECK-LE-P8-LABEL: test_v8i16_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; CHECK-LE-P8-NEXT: mtvsrd v4, r5 +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI6_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtvsrwz v3, r5 +; CHECK-LE-P9-NEXT: vinserth v2, v3, 14 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v4, r5 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI6_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrwz v3, r5 +; CHECK-BE-P9-NEXT: vinserth v2, v3, 0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r4, L..C4(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P9-NEXT: vinserth v2, v3, 0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r4, L..C2(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-32-P9-NEXT: vinserth v2, v3, 0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %vecins = insertelement <8 x i16> %a, i16 %b, i32 0 + ret <8 x i16> %vecins +} + +define <16 x i8> @test_v16i8_v4i32(i8 %arg, i32 %arg1, <16 x i8> %a, <4 x i32> %b) { +; CHECK-LE-P8-LABEL: test_v16i8_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtvsrws v3, r4 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; 
CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: sldi r4, r4, 32 +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: mtvsrd v3, r4 +; CHECK-BE-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtvsrws v3, r4 +; CHECK-BE-P9-NEXT: mtvsrd v2, r3 +; CHECK-BE-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrws v3, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> %a, i8 %arg, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <4 x i32> %b, i32 %arg1, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v4i32_v16i8(i32 %arg, i8 %arg1) { +; CHECK-LE-P8-LABEL: test_v4i32_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: mtvsrws v3, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r4, r4, 56 +; CHECK-BE-P8-NEXT: sldi r3, r3, 32 +; CHECK-BE-P8-NEXT: mtvsrd v2, r4 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r4, r4, 56 +; CHECK-BE-P9-NEXT: mtvsrws v3, r3 +; CHECK-BE-P9-NEXT: mtvsrd v2, r4 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 32 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; 
CHECK-AIX-64-P9-LABEL: test_v4i32_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrws v3, r3 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -32 +; CHECK-AIX-32-P8-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %rhs = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <4 x i32> @test_none_v4i32(<4 x i32> %a, i64 %b) { +; CHECK-LE-P8-LABEL: test_none_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI9_0@toc@ha +; CHECK-LE-P8-NEXT: mtvsrwz v4, r5 +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI9_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI9_1@toc@ha +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI9_1@toc@l +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprwz f0, r5 +; CHECK-LE-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-LE-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI9_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v4, r5 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI9_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI9_1@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI9_1@toc@l +; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r5 +; CHECK-BE-P9-NEXT: xxinsertw v2, vs0, 4 +; CHECK-BE-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r4, L..C5(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C6(r2) # %const.1 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxinsertw v2, vs0, 4 +; CHECK-AIX-64-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: 
test_none_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C4(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %conv = trunc i64 %b to i32 + %vecins = insertelement <4 x i32> %a, i32 %conv, i32 1 + %vecins2 = insertelement <4 x i32> %vecins, i32 %conv, i32 3 + ret <4 x i32> %vecins2 +} + +define <16 x i8> @test_v4i32_none(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v4i32_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI10_0@toc@ha +; CHECK-LE-P8-NEXT: lbzx r4, 0, r4 +; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI10_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI10_0@toc@ha +; CHECK-LE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI10_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vspltb v3, v3, 7 +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lbzx r4, 0, r4 +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI10_0@toc@ha +; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: addi r4, r5, .LCPI10_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI10_0@toc@ha +; CHECK-BE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI10_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vspltb v3, v3, 7 +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lbzx r4, 0, r4 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C7(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vspltb v3, v3, 7 +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: 
test_v4i32_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lbzx r4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C5(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C1(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vspltb v3, v3, 7 +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <4 x i8>, ptr %a, align 4 + %bc1 = bitcast <4 x i8> %0 to i32 + %vecinit3 = insertelement <4 x i32> poison, i32 %bc1, i64 0 + %1 = load <1 x i8>, ptr %b, align 8 + %bc2 = bitcast <1 x i8> %1 to i8 + %vecinit6 = insertelement <16 x i8> undef, i8 %bc2, i64 0 + %2 = bitcast <4 x i32> %vecinit3 to <16 x i8> + %3 = bitcast <16 x i8> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v16i8_v2i64(i8 %arg, i64 %arg1, <16 x i8> %a, <2 x i64> %b) { +; CHECK-LE-P8-LABEL: test_v16i8_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: mtvsrd v3, r4 +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtvsrd v3, r4 +; CHECK-BE-P9-NEXT: mtvsrd v2, r3 +; CHECK-BE-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 
-16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> %a, i8 %arg, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <2 x i64> %b, i64 %arg1, i32 0 + %rhs = bitcast <2 x i64> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_v16i8(i64 %arg, i8 %arg1) { +; CHECK-LE-P8-LABEL: test_v2i64_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r4, r4, 56 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: mtvsrd v2, r4 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r4, r4, 56 +; CHECK-BE-P9-NEXT: mtvsrd v3, r3 +; CHECK-BE-P9-NEXT: mtvsrd v2, r4 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: stb r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r5, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %rhs = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs.tmp = insertelement <2 x i64> undef, i64 %arg, i32 0 + %lhs = bitcast <2 x i64> %lhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define dso_local <16 x i8> @test_1_2(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_1_2: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI13_0@toc@ha +; CHECK-LE-P8-NEXT: lbzx r3, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI13_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: mtvsrwz 
v2, r3 +; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_1_2: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI13_0@toc@ha +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI13_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_1_2: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lbzx r3, 0, r3 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_1_2: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_1_2: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lbzx r3, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_1_2: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_1_2: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lbzx r3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C6(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_1_2: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <1 x i8>, ptr %a, align 4 + %bc1 = bitcast <1 x i8> %0 to i8 + %vecinit3 = insertelement <16 x i8> poison, i8 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <16 x i8> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_none_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_none_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI14_0@toc@ha +; CHECK-LE-P8-NEXT: lbzx r3, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI14_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; 
CHECK-LE-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI14_0@toc@ha +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI14_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lbzx r3, 0, r3 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lbzx r3, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lbzx r3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C7(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C3(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <1 x i8>, ptr %a, align 4 + %bc1 = bitcast <1 x i8> %0 to i8 + %vecinit3 = insertelement <16 x i8> poison, i8 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <16 x i8> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_none(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v2i64_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI15_0@toc@ha +; CHECK-LE-P8-NEXT: lbzx r4, 0, r4 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r3 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI15_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI15_0@toc@ha +; CHECK-LE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI15_0@toc@l +; 
CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vspltb v3, v3, 7 +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lbzx r4, 0, r4 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-BE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-BE-P9-NEXT: vspltb v3, v3, 7 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lbzx r4, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: vspltb v3, v3, 7 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lbzx r4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw v3, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: vspltb v3, v3, 7 +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <8 x i8>, ptr %a, align 4 + %bc1 = bitcast <8 x i8> %0 to i64 + %vecinit3 = insertelement <2 x i64> poison, i64 %bc1, i64 0 + %1 = load <1 x i8>, ptr %b, align 8 + %bc2 = bitcast <1 x i8> %1 to i8 + %vecinit6 = insertelement <16 x i8> undef, i8 %bc2, i64 0 + %2 = bitcast <2 x i64> %vecinit3 to <16 x i8> + %3 = bitcast <16 x i8> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v8i16_v8i16rhs(i16 %arg, i16 %arg1) { +; CHECK-LE-P8-LABEL: test_v8i16_v8i16rhs: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: mtvsrd v3, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v8i16rhs: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtvsrd v2, r3 +; CHECK-LE-P9-NEXT: mtvsrd v3, r4 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v8i16rhs: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI16_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v3, r4 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI16_0@toc@l +; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: vperm v2, v4, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v8i16rhs: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: addis r5, r2, .LCPI16_0@toc@ha +; CHECK-BE-P9-NEXT: mtvsrwz v3, r4 +; CHECK-BE-P9-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P9-NEXT: addi r5, r5, .LCPI16_0@toc@l +; CHECK-BE-P9-NEXT: lxv v2, 0(r5) +; CHECK-BE-P9-NEXT: vperm 
v2, v4, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16rhs: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C8(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16rhs: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: ld r5, L..C4(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r5) +; CHECK-AIX-64-P9-NEXT: vperm v2, v4, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v8i16rhs: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v8i16rhs: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <16 x i8> + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v8i16_v4i32(<8 x i16> %a, <4 x i32> %b, i16 %arg, i32 %arg1) { +; CHECK-LE-P8-LABEL: test_v8i16_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r7 +; CHECK-LE-P8-NEXT: mtfprd f1, r8 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r7 +; CHECK-LE-P9-NEXT: mtvsrws v3, r8 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r7, 48 +; CHECK-BE-P8-NEXT: sldi r4, r8, 32 +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: mtvsrd v3, r4 +; CHECK-BE-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r7, 48 +; CHECK-BE-P9-NEXT: mtvsrws v3, r8 +; CHECK-BE-P9-NEXT: mtvsrd v2, r3 +; CHECK-BE-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrws v3, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: 
test_v8i16_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> %a, i16 %arg, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <4 x i32> %b, i32 %arg1, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v8i16_v2i64(<8 x i16> %a, <2 x i64> %b, i16 %arg, i64 %arg1) { +; CHECK-LE-P8-LABEL: test_v8i16_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r7 +; CHECK-LE-P8-NEXT: mtfprd f1, r8 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r7 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r8 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r7, 48 +; CHECK-BE-P8-NEXT: mtvsrd v3, r8 +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r7, 48 +; CHECK-BE-P9-NEXT: mtvsrd v3, r8 +; CHECK-BE-P9-NEXT: mtvsrd v2, r3 +; CHECK-BE-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> %a, i16 %arg, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to 
<16 x i8> + %rhs.tmp = insertelement <2 x i64> %b, i64 %arg1, i32 0 + %rhs = bitcast <2 x i64> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v4i32_v4i32(i32 %arg, i32 %arg1, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LE-P8-LABEL: test_v4i32_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprwz f0, r3 +; CHECK-LE-P8-NEXT: mtfprwz f1, r4 +; CHECK-LE-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprwz f0, r3 +; CHECK-LE-P9-NEXT: mtfprwz f1, r4 +; CHECK-LE-P9-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: vmrgow v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P9-NEXT: vmrgow v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrgow v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P9-NEXT: vmrgow v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> %a, i32 %arg, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <4 x i32> %b, i32 %arg1, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v4i32_v8i16(i32 %arg, i16 %arg1) { +; CHECK-LE-P8-LABEL: test_v4i32_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: mtvsrws v2, r3 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 32 +; CHECK-BE-P8-NEXT: sldi r4, r4, 48 +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: mtvsrd v3, r4 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; 
CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrws v2, r3 +; CHECK-BE-P9-NEXT: sldi r3, r4, 48 +; CHECK-BE-P9-NEXT: mtvsrd v3, r3 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 32 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrws v2, r3 +; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v2i64_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C8(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; 
CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C4(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <8 x i8>, ptr %a, align 4 + %bc1 = bitcast <8 x i8> %0 to i64 + %vecinit3 = insertelement <2 x i64> poison, i64 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <2 x i64> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_v4i32(i64 %arg, i32 %arg1, <2 x i64> %a, <4 x i32> %b) { +; CHECK-LE-P8-LABEL: test_v2i64_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrglw v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxmrglw v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: sldi r3, r4, 32 +; CHECK-BE-P8-NEXT: mtfprd f1, r3 +; CHECK-BE-P8-NEXT: xxmrghw v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghw v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghw v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrws vs1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghw v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <2 x i64> %a, i64 %arg, i32 0 + %lhs = bitcast <2 x i64> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <4 x i32> %b, i32 %arg1, i32 0 + %rhs = bitcast <4 
x i32> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_v8i16(i64 %arg, i16 %arg1) { +; CHECK-LE-P8-LABEL: test_v2i64_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: sldi r3, r4, 48 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrd v2, r3 +; CHECK-BE-P9-NEXT: sldi r3, r4, 48 +; CHECK-BE-P9-NEXT: mtvsrd v3, r3 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: sth r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r5, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <2 x i64> undef, i64 %arg, i32 0 + %lhs = bitcast <2 x i64> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v4i32_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v4i32_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI24_0@toc@ha +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 +; CHECK-LE-P8-NEXT: addi r3, r5, .LCPI24_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs2, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, f0 +; CHECK-LE-P8-NEXT: xxswapd v3, f1 +; CHECK-LE-P8-NEXT: xxswapd v4, vs2 +; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 +; 
CHECK-LE-P9-NEXT: addis r3, r2, .LCPI24_0@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI24_0@toc@l +; CHECK-LE-P9-NEXT: xxswapd v2, f0 +; CHECK-LE-P9-NEXT: lfd f0, 0(r4) +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI24_0@toc@ha +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: addi r3, r5, .LCPI24_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI24_0@toc@ha +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI24_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C9(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r4) +; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C9(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghw v3, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C5(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxmrghw v3, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <4 x i8>, ptr %a, align 4 + %bc1 = bitcast <4 x i8> %0 to i32 + %vecinit3 = insertelement <4 x i32> poison, i32 %bc1, i64 0 + %1 = load <8 x i8>, ptr %b, align 8 + %bc2 = bitcast <8 x i8> %1 to i64 + %vecinit6 = insertelement <2 x i64> undef, i64 %bc2, i64 0 + %2 = bitcast <4 x i32> %vecinit3 to <16 x i8> + %3 = bitcast <2 x i64> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 
x i8> %3, <16 x i32>
+ ret <16 x i8> %shuffle
+}
diff --git a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll
@@ -0,0 +1,1909 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-LE-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-LE-P9
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE-P9
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P9
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P9
+
+define <2 x i64> @test_v16i8_v16i8(i8 %arg1, i8 %arg) {
+; CHECK-LE-P8-LABEL: test_v16i8_v16i8:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: mtfprd f0, r3
+; CHECK-LE-P8-NEXT: mtfprd f1, r4
+; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v16i8_v16i8:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: mtfprd f0, r3
+; CHECK-LE-P9-NEXT: mtfprd f1, r4
+; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v16i8_v16i8:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: mtfprwz f0, r3
+; CHECK-BE-P8-NEXT: mtfprwz f1, r4
+; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v16i8_v16i8:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: mtfprwz f0, r3
+; CHECK-BE-P9-NEXT: mtfprwz f1, r4
+; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v16i8_v16i8:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3
+; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4
+; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v16i8_v16i8:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3
+; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4
+; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v16i8_v16i8:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16
+; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5
+; CHECK-AIX-32-P8-NEXT: stb r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v16i8_v16i8:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stb r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0
+ %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64>
+ %rhs.tmp = insertelement <16 x i8> undef, i8 %arg, i32 0
+ %rhs = bitcast <16 x i8> %rhs.tmp to <2 x i64>
+ %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
+; CHECK-LE-P8-LABEL: test_none_v16i8:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: xxswapd v2, vs0
+; CHECK-LE-P8-NEXT: mtfprd f0, r3
+; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_none_v16i8:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT: mtfprd f0, r3
+; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_none_v16i8:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT: mtfprwz f0, r3
+; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_none_v16i8:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT: mtfprwz f0, r3
+; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_none_v16i8:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_none_v16i8:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3
+; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_none_v16i8:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16
+; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5
+; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT: xxmrghd v2, v2, vs0
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_none_v16i8:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT: xxmrghd v2, v2, vs0
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0
+ %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64>
+ %rhs = load <2 x i64>, ptr %b, align 4
+ %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
+; CHECK-LE-P8-LABEL: test_v16i8_none:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: xxswapd v2, vs0
+; CHECK-LE-P8-NEXT: mtfprd f0, r3
+; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v16i8_none:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxv
v2, 0(r4) +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs = load <2 x i64>, ptr %b, align 4 + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v16i8_v8i16(i8 %arg1, i16 %arg) { +; CHECK-LE-P8-LABEL: test_v16i8_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: sldi r4, r4, 48 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: sldi r3, r4, 48 +; CHECK-BE-P9-NEXT: mtfprd f1, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; 
CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v8i16_v16i8(i8 %arg1, i16 %arg) { +; CHECK-LE-P8-LABEL: test_v8i16_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: sldi r4, r4, 48 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: sldi r3, r4, 48 +; CHECK-BE-P9-NEXT: mtfprd f1, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: 
test_v8i16_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v8i16_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <2 x i64> + %rhs = load <2 x i64>, ptr %b, align 4 + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_none_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v8i16: +; CHECK-BE-P8: # %bb.0: # 
%entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <2 x i64> + %rhs = load <2 x i64>, ptr %b, align 4 + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v16i8_v4i32(i8 %arg1, i32 %arg) { +; CHECK-LE-P8-LABEL: test_v16i8_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: sldi r4, r4, 32 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrws vs1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: 
addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v4i32_v16i8(i8 %arg1, i32 %arg) { +; CHECK-LE-P8-LABEL: test_v4i32_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: sldi r4, r4, 32 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrws vs1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + 
%rhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_none_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprwz f0, r3 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: mtfprwz f0, r3 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <2 x i64> + %rhs = load <2 x i64>, ptr %b, align 4 + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v4i32_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprwz f0, r3 +; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: mtfprwz f0, r3 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_none: +; 
CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <2 x i64> + %rhs = load <2 x i64>, ptr %b, align 4 + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v16i8_v2i64(i8 %arg1, i64 %arg) { +; CHECK-LE-P8-LABEL: test_v16i8_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxswapd v2, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtfprd f1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C0(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: stw r5, -48(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C1(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 +; 
CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r5 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <2 x i64> undef, i64 %arg, i32 0 + %rhs = bitcast <2 x i64> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v2i64_v16i8(i8 %arg1, i64 %arg) { +; CHECK-LE-P8-LABEL: test_v2i64_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxswapd v2, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprd f0, r4 +; CHECK-BE-P8-NEXT: xxspltd v2, vs0, 0 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtvsrdd v2, r4, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs0 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: mtvsrdd v2, r4, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: stw r4, -48(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -48(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, -48(r1) +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 +; 
CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <2 x i64> undef, i64 %arg, i32 0 + %rhs = bitcast <2 x i64> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_none_v2i64(ptr nocapture noundef readonly %b, i64 %arg) { +; CHECK-LE-P8-LABEL: test_none_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r3) +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-BE-P8-NEXT: mtfprd f0, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r3) +; CHECK-BE-P9-NEXT: mtfprd f0, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r6, L..C2(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r6 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v4, v2 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r5 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs = load <2 x i64>, ptr %b, align 4 + %rhs = insertelement <2 x i64> undef, i64 %arg, i32 0 + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v2i64_none(ptr nocapture noundef readonly %b, i64 %arg) { +; CHECK-LE-P8-LABEL: test_v2i64_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: xxpermdi v2, v2, vs0, 2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r3) +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxpermdi v2, v2, vs0, 2 +; 
CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprd f0, r4 +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-BE-P8-NEXT: xxspltd v3, vs0, 0 +; CHECK-BE-P8-NEXT: xxmrghd v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r3) +; CHECK-BE-P9-NEXT: mtvsrdd v3, r4, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r4 +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v3, vs0, vs0 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: mtvsrdd v3, r4, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs = load <2 x i64>, ptr %b, align 4 + %rhs = insertelement <2 x i64> undef, i64 %arg, i32 0 + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v8i16_v8i16(i16 %arg1, i16 %arg) { +; CHECK-LE-P8-LABEL: test_v8i16_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtfprwz f1, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: 
test_v8i16_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v8i16_v4i32(i16 %arg1, i32 %arg) { +; CHECK-LE-P8-LABEL: test_v8i16_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: sldi r4, r4, 32 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 48 +; CHECK-BE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrws vs1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x 
i16> undef, i16 %arg1, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v8i16_v2i64(i16 %arg1, i64 %arg) { +; CHECK-LE-P8-LABEL: test_v8i16_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxswapd v2, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 48 +; CHECK-BE-P9-NEXT: mtfprd f1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C4(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: stw r5, -48(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C5(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r5 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <2 x i64> undef, i64 %arg, i32 0 + %rhs = bitcast <2 x i64> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v4i32_v4i32(i32 %arg1, i32 %arg) { +; CHECK-LE-P8-LABEL: test_v4i32_v4i32: +; 
CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprwz f0, r3 +; CHECK-LE-P8-NEXT: mtfprwz f1, r4 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprwz f0, r3 +; CHECK-LE-P9-NEXT: mtfprwz f1, r4 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtfprwz f1, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v4i32_v8i16(i32 %arg1, i16 %arg) { +; CHECK-LE-P8-LABEL: test_v4i32_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: mtvsrws vs0, r3 +; CHECK-LE-P9-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 32 +; CHECK-BE-P8-NEXT: sldi r4, r4, 48 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrws vs0, r3 +; CHECK-BE-P9-NEXT: sldi r3, r4, 48 +; CHECK-BE-P9-NEXT: mtfprd f1, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: +; 
CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 32 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrws vs0, r3 +; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v4i32_v2i64(i32 %arg1, i64 %arg) { +; CHECK-LE-P8-LABEL: test_v4i32_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: mtvsrws vs0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 32 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrws vs0, r3 +; CHECK-BE-P9-NEXT: mtfprd f1, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 32 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrws vs0, r3 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: stw r3, -48(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; 
CHECK-AIX-32-P8-NEXT: lwz r3, L..C6(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -48(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C0(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -48(r1) +; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v4, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <2 x i64> + %rhs = insertelement <2 x i64> undef, i64 %arg, i32 0 + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v2i64_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v2i64_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: ld r3, 0(r3) +; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: xxmrghd v3, vs0, vs1 +; CHECK-LE-P8-NEXT: vaddudm v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: ld r3, 0(r3) +; CHECK-LE-P9-NEXT: lfd f1, 0(r4) +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: xxmrghd v3, vs1, vs0 +; CHECK-LE-P9-NEXT: vaddudm v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-BE-P8-NEXT: lfdx f0, 0, r4 +; CHECK-BE-P8-NEXT: xxmrghd v3, v2, vs0 +; CHECK-BE-P8-NEXT: vaddudm v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-BE-P9-NEXT: lfd f0, 0(r4) +; CHECK-BE-P9-NEXT: xxmrghd v3, v2, vs0 +; CHECK-BE-P9-NEXT: vaddudm v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v3, v2, vs0 +; CHECK-AIX-64-P8-NEXT: vaddudm v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r4) +; CHECK-AIX-64-P9-NEXT: xxmrghd v3, v2, vs0 +; CHECK-AIX-64-P9-NEXT: vaddudm v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r3) +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -32 +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 4(r4) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r3, -48(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: 
addi r4, r1, -64 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: stw r3, -64(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -48 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs3, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs3, vs2 +; CHECK-AIX-32-P8-NEXT: xxmrghd v3, v2, vs0 +; CHECK-AIX-32-P8-NEXT: vaddudm v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lwz r5, 4(r3) +; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -48(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -64(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -64(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: xxmrghd v3, v2, vs0 +; CHECK-AIX-32-P9-NEXT: vaddudm v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <8 x i8>, ptr %a, align 8 + %bc1 = bitcast <8 x i8> %0 to i64 + %vecinit3 = insertelement <2 x i64> poison, i64 %bc1, i64 0 + %1 = load <8 x i8>, ptr %b, align 8 + %bc2 = bitcast <8 x i8> %1 to i64 + %vecinit6 = insertelement <2 x i64> undef, i64 %bc2, i64 0 + %2 = bitcast <2 x i64> %vecinit3 to <2 x i64> + %3 = bitcast <2 x i64> %vecinit6 to <2 x i64> + %shuffle = shufflevector <2 x i64> %2, <2 x i64> %3, <2 x i32> + %4 = add <2 x i64> %shuffle, %2 + ret <2 x i64> %4 +} + +define <2 x i64> @test_v2i64_v4i32(i64 %arg1, i32 %arg) { +; CHECK-LE-P8-LABEL: test_v2i64_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtvsrws vs0, r4 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: xxspltd v2, vs0, 0 +; CHECK-BE-P8-NEXT: mtfprwz f0, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r4 +; CHECK-BE-P9-NEXT: mtvsrdd v2, r3, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs0 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrdd v2, r3, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -48 +; CHECK-AIX-32-P8-NEXT: stw r5, -48(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: 
stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r5, -48(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <2 x i64> undef, i64 %arg1, i32 0 + %lhs = bitcast <2 x i64> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v2i64_v8i16(i64 %arg1, i16 %arg) { +; CHECK-LE-P8-LABEL: test_v2i64_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: xxspltd v2, vs0, 0 +; CHECK-BE-P8-NEXT: mtfprwz f0, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r4 +; CHECK-BE-P9-NEXT: mtvsrdd v2, r3, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs0 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrdd v2, r3, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -48 +; CHECK-AIX-32-P8-NEXT: sth r5, -48(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, 
-32(r1) +; CHECK-AIX-32-P9-NEXT: sth r5, -48(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <2 x i64> undef, i64 %arg1, i32 0 + %lhs = bitcast <2 x i64> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + diff --git a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll @@ -0,0 +1,1445 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LE-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LE-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE-P9 + +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P9 + +define void @test_none_v8i16(ptr %a) { +; CHECK-LE-P8-LABEL: test_none_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha +; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: lhz r4, 0(r3) +; CHECK-LE-P8-NEXT: mtvsrd v4, r4 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stfdx f0, 0, r3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxsd v3, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P9-NEXT: xxswapd vs0, v2 +; CHECK-LE-P9-NEXT: stfd f0, 0(r3) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lhz r4, 0(r3) +; CHECK-BE-P8-NEXT: lfdx f0, 0, r3 +; CHECK-BE-P8-NEXT: sldi r3, r4, 48 +; 
CHECK-BE-P8-NEXT: mtfprd f1, r3 +; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P8-NEXT: stfdx f0, 0, r3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lfd f0, 0(r3) +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, v2 +; CHECK-BE-P9-NEXT: stfd f0, 0(r3) +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: stfdx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3) +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, v2 +; CHECK-AIX-64-P9-NEXT: stfd f0, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: stw r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: stxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr undef, align 1 + %tmp0_1 = bitcast <2 x i8> %0 to i16 + %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0 + %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32> + %1 = load <2 x i32>, ptr %a + %tmp1_1 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> + %2 = shufflevector <4 x i32> %tmp0_3, <4 x i32> %tmp1_1, <2 x i32> + store <2 x i32> %2, ptr undef, align 4 + ret void +} + +define void @test_v8i16_none(ptr %a) { +; CHECK-LE-P8-LABEL: test_v8i16_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lhz r4, 0(r3) +; CHECK-BE-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx 
f0, 0, r3 +; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-64-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr undef, align 1 + %tmp0_1 = bitcast <2 x i8> %0 to i16 + %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0 + %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32> + %1 = load <4 x i32>, ptr %a, align 1 + %2 = shufflevector <4 x i32> %tmp0_3, <4 x i32> %1, <4 x i32> + store <4 x i32> %2, ptr undef, align 4 + ret void +} + +define void @test_none_v4i32(<2 x i32> %vec, ptr %ptr1) { +; CHECK-LE-P8-LABEL: test_none_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r3 +; CHECK-LE-P8-NEXT: mffprwz r3, f0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: li r3, 0 +; CHECK-LE-P9-NEXT: vextuwrx r3, r3, v2 +; CHECK-LE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: stxv v2, 0(r5) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: xxsldwi vs0, v2, v2, 3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: mffprwz r4, f0 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 +; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: li r3, 0 +; CHECK-BE-P9-NEXT: vextuwlx r3, r3, v2 +; CHECK-BE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: stxv v2, 0(r5) +; 
CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: ld r4, L..C0(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mffprwz r5, f0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: li r4, 0 +; CHECK-AIX-64-P9-NEXT: vextuwlx r4, r4, v2 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P9-NEXT: ld r4, L..C0(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r4) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r4, L..C0(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lwz r4, L..C0(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r4) +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = extractelement <2 x i32> %vec, i64 0 + %1 = bitcast i32 %0 to <2 x i16> + %2 = shufflevector <2 x i16> %1, <2 x i16> %1, <8 x i32> + %3 = shufflevector <2 x i32> %vec, <2 x i32> %vec, <4 x i32> + %4 = bitcast <4 x i32> %3 to <8 x i16> + %5 = shufflevector <8 x i16> %4, <8 x i16> %2, <8 x i32> + store <8 x i16> %5, ptr %ptr1, align 16 + ret void +} + +define void @test_v4i32_none(<2 x i32> %vec, ptr %ptr1) { +; CHECK-LE-P8-LABEL: test_v4i32_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r3 +; CHECK-LE-P8-NEXT: mffprwz r3, f0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: li r3, 0 +; CHECK-LE-P9-NEXT: vextuwrx r3, r3, v2 +; CHECK-LE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P9-NEXT: stxv v2, 0(r5) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: xxsldwi vs0, v2, v2, 3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: mffprwz r4, f0 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 +; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_none: +; CHECK-BE-P9: # %bb.0: 
# %entry +; CHECK-BE-P9-NEXT: li r3, 0 +; CHECK-BE-P9-NEXT: vextuwlx r3, r3, v2 +; CHECK-BE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P9-NEXT: stxv v2, 0(r5) +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: ld r4, L..C1(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mffprwz r5, f0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: li r4, 0 +; CHECK-AIX-64-P9-NEXT: vextuwlx r4, r4, v2 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P9-NEXT: ld r4, L..C1(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r4) +; CHECK-AIX-64-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r4, L..C1(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lwz r4, L..C1(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r4) +; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = extractelement <2 x i32> %vec, i64 0 + %1 = bitcast i32 %0 to <2 x i16> + %2 = shufflevector <2 x i16> %1, <2 x i16> %1, <8 x i32> + %3 = shufflevector <2 x i32> %vec, <2 x i32> %vec, <4 x i32> + %4 = bitcast <4 x i32> %3 to <8 x i16> + %5 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> + store <8 x i16> %5, ptr %ptr1, align 16 + ret void +} + +define void @test_none_v2i64(ptr %ptr, i32 %v1, <2 x i32> %vec) local_unnamed_addr #0 { +; CHECK-LE-P8-LABEL: test_none_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha +; CHECK-LE-P8-NEXT: mtvsrwz v4, r4 +; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI4_1@toc@ha +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l +; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI4_1@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 +; +; CHECK-LE-P9-LABEL: test_none_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsd v3, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; CHECK-LE-P9-NEXT: mtfprwz f0, r4 +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l +; CHECK-LE-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: stxv v2, 0(r3) 
+;
+; CHECK-BE-P8-LABEL: test_none_v2i64:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha
+; CHECK-BE-P8-NEXT: mtvsrwz v4, r4
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI4_1@toc@ha
+; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI4_1@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r5
+; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P8-NEXT: lxsdx v3, 0, r3
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r4
+; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+;
+; CHECK-BE-P9-LABEL: test_none_v2i64:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsd v3, 0(r3)
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha
+; CHECK-BE-P9-NEXT: mtfprwz f0, r4
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l
+; CHECK-BE-P9-NEXT: xxinsertw v2, vs0, 0
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+;
+; CHECK-AIX-64-P8-LABEL: test_none_v2i64:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r5, L..C2(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r4
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C3(r2) # %const.1
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5
+; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r3
+; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4
+; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+;
+; CHECK-AIX-64-P9-LABEL: test_none_v2i64:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r3)
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4
+; CHECK-AIX-64-P9-NEXT: xxinsertw v2, vs0, 0
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+;
+; CHECK-AIX-32-P8-LABEL: test_none_v2i64:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r5, L..C2(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3
+; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x v5, 0, r3
+; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.1
+; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-AIX-32-P8-NEXT: vperm v2, v5, v2, v4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3
+; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+;
+; CHECK-AIX-32-P9-LABEL: test_none_v2i64:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+entry:
+ %0 = load <2 x i32>, ptr %ptr, align 4
+ %tmp = insertelement <2 x i32> %vec, i32 %v1, i32 0
+ %1 = shufflevector <2 x i32> %0, <2 x i32> %tmp, <4 x i32> 
+ store <4 x i32> %1, ptr undef, align 4
+ unreachable
+}
+
+define void @test_v2i64_none() {
+; CHECK-LE-P8-LABEL: test_v2i64_none:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v2i64_none:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v2i64_none:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v2i64_none:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v2i64_none:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v2i64_none:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v2i64_none:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v2i64_none:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i32>, ptr undef, align 4
+ %1 = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> 
+ store <4 x i32> %1, ptr undef, align 4
+ ret void
+}
+
+define void @test_v8i16_v8i16(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v8i16_v8i16:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-LE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-LE-P8-NEXT: mtfprd f0, r4
+; CHECK-LE-P8-NEXT: mtfprd f1, r3
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v8i16_v8i16:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3
+; CHECK-LE-P9-NEXT: lxsihzx f1, 0, r3
+; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v8i16_v8i16:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-BE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-BE-P8-NEXT: mtfprwz f0, r4
+; CHECK-BE-P8-NEXT: mtfprwz f1, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v8i16_v8i16:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx f0, 0, r3
+; CHECK-BE-P9-NEXT: lxsihzx f1, 0, r3
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r4
+; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3
+; CHECK-AIX-64-P9-NEXT: lxsihzx f1, 0, r3
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v8i16_v8i16:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: mtfprwz f0, r4
+; CHECK-AIX-32-P8-NEXT: mtfprwz f1, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v8i16_v8i16:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3
+; CHECK-AIX-32-P9-NEXT: lxsihzx f1, 0, r3
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i8>, ptr undef, align 1
+ %tmp0_1 = bitcast <2 x i8> %0 to i16
+ %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0
+ %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32>
+ %1 = load <2 x i8>, ptr %a, align 1
+ %tmp1_1 = bitcast <2 x i8> %1 to i16
+ %tmp1_2 = insertelement <8 x i16> undef, i16 %tmp1_1, i32 0
+ %tmp1_3 = bitcast <8 x i16> %tmp1_2 to <4 x i32>
+ %2 = shufflevector <4 x i32> %tmp0_3, <4 x i32> %tmp1_3, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v8i16_v4i32(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v8i16_v4i32:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-LE-P8-NEXT: mtfprd f1, r4
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, vs1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v8i16_v4i32:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, v2
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v8i16_v4i32:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-BE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-BE-P8-NEXT: sldi r3, r4, 48
+; CHECK-BE-P8-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-BE-P8-NEXT: mtfprd f1, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v8i16_v4i32:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-BE-P9-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-BE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-BE-P9-NEXT: xxmrghw vs0, v2, vs0
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48
+; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-AIX-64-P9-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, v2, vs0
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4
+; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i8>, ptr undef, align 1
+ %tmp0_1 = bitcast <2 x i8> %0 to i16
+ %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0
+ %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32>
+ %1 = load <2 x i16>, ptr %a, align 4
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp0_3, <4 x i32> %tmp1_2, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v8i16_v2i64(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v8i16_v2i64:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: mtfprd f1, r4
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, vs1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v8i16_v2i64:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, v2
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v8i16_v2i64:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: sldi r3, r4, 48
+; CHECK-BE-P8-NEXT: mtfprd f1, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v8i16_v2i64:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-BE-P9-NEXT: xxmrghw vs0, v2, vs0
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v8i16_v2i64:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48
+; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v8i16_v2i64:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, v2, vs0
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4
+; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i8>, ptr undef, align 1
+ %tmp0_1 = bitcast <2 x i8> %0 to i16
+ %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0
+ %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32>
+ %1 = load <2 x i16>, ptr %a, align 8
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp0_3, <4 x i32> %tmp1_2, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define <16 x i8> @test_v4i32_v4i32(ptr %a, ptr %b) {
+; CHECK-LE-P8-LABEL: test_v4i32_v4i32:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI9_0@toc@ha
+; CHECK-LE-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r4
+; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI9_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5
+; CHECK-LE-P8-NEXT: xxswapd v4, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v4i32_v4i32:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI9_0@toc@ha
+; CHECK-LE-P9-NEXT: lxsiwzx v3, 0, r4
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI9_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v4i32_v4i32:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI9_0@toc@ha
+; CHECK-BE-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r4
+; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI9_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v4i32_v4i32:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI9_0@toc@ha
+; CHECK-BE-P9-NEXT: lxsiwzx v3, 0, r4
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI9_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r5, L..C4(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: lxsiwzx v3, 0, r4
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r5, L..C4(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C3(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %load1 = load <4 x i8>, ptr %a
+ %load2 = load <4 x i8>, ptr %b
+ %shuffle1 = shufflevector <4 x i8> %load1, <4 x i8> %load2, <8 x i32> 
+ %shuffle2 = shufflevector <8 x i8> %shuffle1, <8 x i8> %shuffle1, <16 x i32> 
+ ret <16 x i8> %shuffle2
+}
+
+define void @test_v4i32_v8i16(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v4i32_v8i16:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-LE-P8-NEXT: mtfprd f1, r4
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, vs1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v4i32_v8i16:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-LE-P9-NEXT: xxmrglw vs0, v2, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v4i32_v8i16:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-BE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-BE-P8-NEXT: sldi r3, r4, 48
+; CHECK-BE-P8-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-BE-P8-NEXT: mtfprd f1, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v4i32_v8i16:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-BE-P9-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-BE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, v2
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48
+; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-AIX-64-P9-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, v2
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4
+; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i8>, ptr undef, align 1
+ %tmp0_1 = bitcast <2 x i8> %0 to i16
+ %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0
+ %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32>
+ %1 = load <2 x i16>, ptr %a, align 4
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp1_2, <4 x i32> %tmp0_3, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v4i32_v2i64(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v4i32_v2i64:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, f1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v4i32_v2i64:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: xxswapd vs1, f1
+; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, vs1
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v4i32_v2i64:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v4i32_v2i64:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v4i32_v2i64:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i16>, ptr undef, align 8
+ %tmp0_1 = bitcast <2 x i16> %0 to i32
+ %tmp0_2 = insertelement <4 x i32> undef, i32 %tmp0_1, i32 0
+ %1 = load <2 x i16>, ptr %a, align 4
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp1_2, <4 x i32> %tmp0_2, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v2i64_v2i64(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v2i64_v2i64:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: lfdx f1, 0, r3
+; CHECK-LE-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v2i64_v2i64:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: lfd f1, 0(r3)
+; CHECK-LE-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v2i64_v2i64:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: lfdx f1, 0, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v2i64_v2i64:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: lfd f1, 0(r3)
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: lfdx f1, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: lfd f1, 0(r3)
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r3)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16
+; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1)
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: lfiwzx f2, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs2, 1
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lwz r4, 4(r3)
+; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i32>, ptr undef, align 4
+ %1 = load <2 x i32>, ptr %a, align 4
+ %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v2i64_v4i32(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v2i64_v4i32:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, f1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v2i64_v4i32:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: xxswapd vs1, f1
+; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v2i64_v4i32:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v2i64_v4i32:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v2i64_v4i32:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v2i64_v4i32:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i16>, ptr undef, align 8
+ %tmp0_1 = bitcast <2 x i16> %0 to i32
+ %tmp0_2 = insertelement <4 x i32> undef, i32 %tmp0_1, i32 0
+ %1 = load <2 x i16>, ptr %a, align 4
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp0_2, <4 x i32> %tmp1_2, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v2i64_v8i16(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v2i64_v8i16:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: mtfprd f1, r4
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, vs1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v2i64_v8i16:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-LE-P9-NEXT: xxmrglw vs0, v2, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v2i64_v8i16:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: sldi r3, r4, 48
+; CHECK-BE-P8-NEXT: mtfprd f1, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v2i64_v8i16:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, v2
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v2i64_v8i16:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48
+; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v2i64_v8i16:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, v2
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4
+; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i8>, ptr undef, align 1
+ %tmp0_1 = bitcast <2 x i8> %0 to i16
+ %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0
+ %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32>
+ %1 = load <2 x i16>, ptr %a, align 8
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp1_2, <4 x i32> %tmp0_3, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
@@ -0,0 +1,1554 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-LE-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-LE-P9
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE-P9
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P9
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P9
+
+define void @test_none_v8i16(ptr %a0, ptr %a1, <16 x i8> %a, <8 x i16> %b, i8 %arg) {
+; CHECK-LE-P8-LABEL: test_none_v8i16:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha
+; CHECK-LE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: mtvsrd v4, r3
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_none_v8i16:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-LE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_none_v8i16:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha
+; CHECK-BE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: mtvsrwz v4, r3
+; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_none_v8i16:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_none_v8i16:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C0(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_none_v8i16:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C0(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_none_v8i16:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, L..C0(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r3
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_none_v8i16:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C0(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %load0.tmp = load <2 x i8>, ptr %a0
+ %load0.tmp1 = bitcast <2 x i8> %load0.tmp to i16
+ %load0 = insertelement <8 x i16> %b, i16 %load0.tmp1, i64 0
+ %load1.tmp = insertelement <16 x i8> %a, i8 %arg, i32 0
+ %load1 = bitcast <16 x i8> %load1.tmp to <8 x i16>
+ %shuff = shufflevector <8 x i16> %load0, <8 x i16> %load1, <8 x i32> 
+ store <8 x i16> %shuff, ptr undef
+ ret void
+}
+
+define void @test_v8i16_none(ptr %a0, ptr %a1, <16 x i8> %a, <8 x i16> %b, i8 %arg) {
+; CHECK-LE-P8-LABEL: test_v8i16_none:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI1_0@toc@ha
+; CHECK-LE-P8-NEXT: mtvsrd v4, r9
+; CHECK-LE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI1_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI1_1@toc@ha
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI1_1@toc@l
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: mtvsrd v4, r3
+; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v8i16_none:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-LE-P9-NEXT: mtvsrwz v4, r9
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l
+; CHECK-LE-P9-NEXT: vinsertb v2, v4, 15
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-LE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v8i16_none:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI1_0@toc@ha
+; CHECK-BE-P8-NEXT: mtvsrwz v4, r9
+; CHECK-BE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI1_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI1_1@toc@ha
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI1_1@toc@l
+; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: mtvsrwz v4, r3
+; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v8i16_none:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-BE-P9-NEXT: mtvsrwz v4, r9
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l
+; CHECK-BE-P9-NEXT: vinsertb v2, v4, 0
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v8i16_none:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C1(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r5
+; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C2(r2) # %const.1
+; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3
+; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v8i16_none:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C1(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: mtvsrwz v4, r5
+; CHECK-AIX-64-P9-NEXT: vinsertb v2, v4, 0
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v8i16_none:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, L..C1(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r5
+; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: lwz r4, L..C2(r2) # %const.1
+; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r3
+; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v8i16_none:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C1(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: mtvsrwz v4, r5
+; CHECK-AIX-32-P9-NEXT: vinsertb v2, v4, 0
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %load0.tmp = load <2 x i8>, ptr %a0
+ %load0.tmp1 = bitcast <2 x i8> %load0.tmp to i16
+ %load0 = insertelement <8 x i16> %b, i16 %load0.tmp1, i64 0
+ %load1.tmp = insertelement <16 x i8> %a, i8 %arg, i32 0
+ %load1 = bitcast <16 x i8> %load1.tmp to <8 x i16>
+ %shuff = shufflevector <8 x i16> %load0, <8 x i16> %load1, <8 x i32> 
+ store <8 x i16> %shuff, ptr undef
+ ret void
+}
+
+define void @test_none_v4i32(ptr %ptr, ptr %ptr2, i8 %v3) local_unnamed_addr #0 {
+; CHECK-LE-P8-LABEL: test_none_v4i32:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-LE-P8-NEXT: mtvsrd v3, r5
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_1@toc@ha
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_1@toc@l
+; CHECK-LE-P8-NEXT: xxswapd v2, vs0
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: vperm v2, v3, v3, v2
+; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r3
+; CHECK-LE-P8-NEXT: xxswapd v4, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_none_v4i32:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha
+; CHECK-LE-P9-NEXT: mtvsrd v3, r5
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_1@toc@ha
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_1@toc@l
+; CHECK-LE-P9-NEXT: vperm v3, v3, v3, v4
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-LE-P9-NEXT: xxswapd vs0, v2
+; CHECK-LE-P9-NEXT: stfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_none_v4i32:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-BE-P8-NEXT: mtvsrwz v3, r5
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r4
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI2_1@toc@ha
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI2_1@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r4
+; CHECK-BE-P8-NEXT: vperm v2, v3, v3, v2
+; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3
+; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-BE-P8-NEXT: stxsdx v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_none_v4i32:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha
+; CHECK-BE-P9-NEXT: mtvsrwz v3, r5
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_1@toc@ha
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_1@toc@l
+; CHECK-BE-P9-NEXT: vperm v3, v3, v3, v4
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P9-NEXT: stxsd v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_none_v4i32:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C3(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r5
+; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C4(r2) # %const.1
+; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v3, v2
+; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3
+; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4
+; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-64-P8-NEXT: stxsdx v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_none_v4i32:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r5
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.1
+; CHECK-AIX-64-P9-NEXT: vperm v3, v3, v3, v4
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P9-NEXT: stxsd v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_none_v4i32:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: stb r5, -32(r1)
+; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: vmrghh v3, v3, v3
+; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: lwz r3, -12(r1)
+; CHECK-AIX-32-P8-NEXT: stw r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lwz r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: stw r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_none_v4i32:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: stb r5, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vmrghh v3, v3, v3
+; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, -12(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i16>, ptr %ptr, align 4
+ %tmp = insertelement <4 x i8> undef, i8 %v3, i32 0
+ %tmp0 = bitcast <4 x i8> %tmp to <2 x i16>
+ %1 = shufflevector <2 x i16> %0, <2 x i16> %tmp0, <4 x i32> 
+ store <4 x i16> %1, ptr undef, align 4
+ ret void
+}
+
+define void @test_v4i32_none(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) {
+; CHECK-LE-P8-LABEL: test_v4i32_none:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-LE-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI3_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v4i32_none:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha
+; CHECK-LE-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v3, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v4i32_none:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-BE-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-BE-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI3_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v4i32_none:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha
+; CHECK-BE-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v3, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v4i32_none:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C5(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v4i32_none:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C4(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-64-P9-NEXT: lxv v3, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v4i32_none:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, L..C4(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v4i32_none:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C3(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-32-P9-NEXT: lxv v3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i16>, ptr %ptr1, align 1
+ %1 = load <2 x i16>, ptr %ptr2, align 1
+ %shuffle1 = shufflevector <2 x i16> %0, <2 x i16> %1, <4 x i32> 
+ %2 = zext <4 x i16> %shuffle1 to <4 x i32>
+ store <4 x i32> %2, ptr undef, align 16
+ ret void
+}
+
+define void @test_none_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) {
+; CHECK-LE-P8-LABEL: test_none_v2i64:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI4_1@toc@ha
+; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l
+; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI4_1@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r5
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: xxswapd v4, vs1
+; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_none_v2i64:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha
+; CHECK-LE-P9-NEXT: lxv v3, 0(r4)
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI4_1@toc@ha
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI4_1@toc@l
+; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-LE-P9-NEXT: lxv v3, 0(r3)
+; CHECK-LE-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_none_v2i64:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha
+; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-BE-P8-NEXT: xxlxor v3, v3, v3
+; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_none_v2i64:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha
+; CHECK-BE-P9-NEXT: lxv v3, 0(r4)
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-BE-P9-NEXT: xxlxor v3, v3, v3
+; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_none_v2i64:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r5, L..C6(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-64-P8-NEXT: xxlxor v3, v3, v3
+; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_none_v2i64:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: lxv v3, 0(r4)
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3
+; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_none_v2i64:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r5, L..C5(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-32-P8-NEXT: xxlxor v3, v3, v3
+; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_none_v2i64:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C4(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: lxv v3, 0(r4)
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-32-P9-NEXT: xxlxor v3, v3, v3
+; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <4 x i16>, ptr %ptr1, align 1
+ %1 = load <4 x i32>, ptr %ptr2, align 1
+ %bc = trunc <4 x i32> %1 to <4 x i16>
+ %shuffle1 = shufflevector <4 x i16> %0, <4 x i16> %bc, <4 x i32> 
+ %2 = zext <4 x i16> %shuffle1 to <4 x i32>
+ store <4 x i32> %2, ptr undef, align 16
+ ret void
+}
+
+define void @test_v2i64_none(ptr nocapture readonly %ptr1) {
+; CHECK-LE-P8-LABEL: test_v2i64_none:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI5_0@toc@ha
+; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI5_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v2i64_none:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI5_0@toc@ha
+; CHECK-LE-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI5_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v3, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v2i64_none:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI5_0@toc@ha
+; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-BE-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI5_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v2i64_none:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI5_0@toc@ha
+; CHECK-BE-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI5_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v3, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v2i64_none:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C7(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v2i64_none:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C6(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-64-P9-NEXT: lxv v3, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v2i64_none:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, 4(r3)
+; CHECK-AIX-32-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: lwz r3, L..C6(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v2i64_none:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lwz r4, 4(r3)
+; CHECK-AIX-32-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C5(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv v3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <4 x i16>, ptr %ptr1, align 1
+ %shuffle1 = shufflevector <4 x i16> %0, <4 x i16> undef, <4 x i32> 
+ %1 = zext <4 x i16> %shuffle1 to <4 x i32>
+ store <4 x i32> %1, ptr undef, align 16
+ ret void
+}
+
+define <16 x i8> @test_v8i16_v8i16(ptr %a, ptr %b) {
+; CHECK-LE-P8-LABEL: test_v8i16_v8i16:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI6_0@toc@ha
+; CHECK-LE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-LE-P8-NEXT: lhz r4, 0(r4)
+; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI6_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5
+; CHECK-LE-P8-NEXT: mtvsrd v2, r3
+; CHECK-LE-P8-NEXT: mtvsrd v4, r4
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v8i16_v8i16:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI6_0@toc@ha
+; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r4
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI6_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v8i16_v8i16:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r5, r2,
.LCPI6_0@toc@ha +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: lhz r4, 0(r4) +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI6_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 +; CHECK-BE-P8-NEXT: vperm v2, v3, v4, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI6_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C8(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C7(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C7(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C6(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %load1 = load <2 x i8>, ptr %a + %load2 = load <2 x i8>, ptr %b + %shuffle1 = shufflevector <2 x i8> %load1, <2 x i8> %load2, <8 x i32> + %shuffle2 = shufflevector <8 x i8> %shuffle1, <8 x i8> %shuffle1, <16 x i32> + ret <16 x i8> %shuffle2 +} + +define <16 x i8> @test_v8i16_v4i32(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v8i16_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, f0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, 
r3 +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-BE-P9-NEXT: xxsldwi v3, f0, f0, 1 +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-64-P9-NEXT: xxsldwi v3, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 4 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v8i16_v2i64(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v8i16_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, f0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lfd f0, 0(r4) +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: lxsdx v2, 0, r4 +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; 
CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define void @test_v4i32_v4i32(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) { +; CHECK-LE-P8-LABEL: test_v4i32_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI9_0@toc@ha +; CHECK-LE-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI9_1@toc@ha +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI9_0@toc@l +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI9_1@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI9_0@toc@ha +; CHECK-LE-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI9_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI9_1@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI9_1@toc@l +; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P9-NEXT: lxv v3, 0(r3) +; CHECK-LE-P9-NEXT: xxlxor v4, v4, v4 +; CHECK-LE-P9-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P9-NEXT: stxv v2, 0(r3) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI9_0@toc@ha +; CHECK-BE-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI9_0@toc@l +; 
CHECK-BE-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI9_0@toc@ha +; CHECK-BE-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI9_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: stxv v2, 0(r3) +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C9(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C8(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C8(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C7(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i16>, ptr %ptr1, align 1 + %1 = load <2 x i16>, ptr %ptr2, align 1 + %shuffle1 = shufflevector <2 x i16> %0, <2 x i16> %1, <4 x i32> + %2 = zext <4 x i16> %shuffle1 to <4 x i32> + store <4 x i32> %2, ptr undef, align 16 + ret void +} + +define <16 x i8> @test_v4i32_v8i16(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v4i32_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, f0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v8i16: +; 
CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-BE-P9-NEXT: xxsldwi v3, f0, f0, 1 +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-64-P9-NEXT: xxsldwi v3, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 4 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %3, <16 x i8> %2, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v4i32_v2i64(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v4i32_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, f0 +; CHECK-LE-P8-NEXT: xxswapd v3, f1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, f0 +; CHECK-LE-P9-NEXT: lfd f0, 0(r4) +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: 
test_v4i32_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C9(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C8(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a, align 4 + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define void @test_v2i64_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) { +; CHECK-LE-P8-LABEL: test_v2i64_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI12_0@toc@ha +; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI12_1@toc@ha +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI12_0@toc@l +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI12_1@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI12_0@toc@ha +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI12_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI12_1@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI12_1@toc@l +; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P9-NEXT: lxv v3, 0(r3) +; CHECK-LE-P9-NEXT: xxlxor v4, v4, v4 +; CHECK-LE-P9-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P9-NEXT: stxv v2, 0(r3) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI12_0@toc@ha +; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3 
+; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI12_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI12_0@toc@ha +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI12_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: stxv v2, 0(r3) +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C10(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: ld r3, L..C9(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C10(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C9(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <4 x i16>, ptr %ptr1, align 1 + %1 = load <4 x i16>, ptr %ptr2, align 1 + %shuffle1 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> + %2 = zext <4 x i16> %shuffle1 to <4 x i32> + store <4 x i32> %2, ptr undef, align 16 + ret void +} + +define <16 x i8> @test_v2i64_v4i32(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v2i64_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, f0 +; CHECK-LE-P8-NEXT: xxswapd v3, f1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, f0 +; CHECK-LE-P9-NEXT: lfd f0, 0(r4) +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; 
CHECK-BE-P8-LABEL: test_v2i64_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C11(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C10(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a, align 4 + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %3, <16 x i8> %2, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_v8i16(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v2i64_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, f0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lfd f0, 0(r4) +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: lxsdx v2, 0, r4 +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; 
CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %3, <16 x i8> %2, <16 x i32> + ret <16 x i8> %shuffle +} diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll --- a/llvm/test/CodeGen/RISCV/iabs.ll +++ b/llvm/test/CodeGen/RISCV/iabs.ll @@ -727,9 +727,9 @@ ; ; RV64ZBB-LABEL: zext_abs32: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: sext.w a0, a0 -; RV64ZBB-NEXT: negw a1, a0 -; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sext.w a1, a0 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: max a0, a1, a0 ; RV64ZBB-NEXT: ret ; ; RV64ZBT-LABEL: zext_abs32: diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -9,8 +9,7 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-LABEL: ctlz_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a1, a0 -; RV64I-NEXT: beqz a1, .LBB0_2 +; RV64I-NEXT: beqz a0, .LBB0_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill @@ -63,8 +62,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-LABEL: log2_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a1, a0 -; RV64I-NEXT: beqz a1, .LBB1_2 +; RV64I-NEXT: beqz a0, .LBB1_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill @@ -368,34 +366,34 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ; RV64I-LABEL: cttz_i32: ; RV64I: # %bb.0: +; RV64I-NEXT: beqz a0, .LBB6_4 +; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: sext.w s0, a0 -; RV64I-NEXT: beqz s0, .LBB6_3 -; RV64I-NEXT: # %bb.1: # %cond.false 
-; RV64I-NEXT: neg a1, a0 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: and a0, s0, a0 ; RV64I-NEXT: lui a1, 30667 ; RV64I-NEXT: addiw a1, a1, 1329 ; RV64I-NEXT: call __muldi3@plt ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: beqz s0, .LBB6_4 +; RV64I-NEXT: beqz s0, .LBB6_3 ; RV64I-NEXT: # %bb.2: # %cond.false ; RV64I-NEXT: srliw a0, a1, 27 ; RV64I-NEXT: lui a1, %hi(.LCPI6_0) ; RV64I-NEXT: addi a1, a1, %lo(.LCPI6_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: j .LBB6_4 -; RV64I-NEXT: .LBB6_3: -; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: .LBB6_4: # %cond.end +; RV64I-NEXT: .LBB6_3: # %cond.false ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret +; RV64I-NEXT: .LBB6_4: +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: cttz_i32: ; RV64ZBB: # %bb.0: @@ -928,7 +926,7 @@ define signext i32 @abs_i32_sext(i32 signext %x) { ; RV64I-LABEL: abs_i32_sext: ; RV64I: # %bb.0: -; RV64I-NEXT: sraiw a1, a0, 31 +; RV64I-NEXT: srai a1, a0, 31 ; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -1725,17 +1725,10 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: sw a1, 12(sp) -; RV64ZVE32F-NEXT: sw a0, 8(sp) -; RV64ZVE32F-NEXT: addi a0, sp, 12 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vle32.v v9, (a0) -; RV64ZVE32F-NEXT: addi a0, sp, 8 -; RV64ZVE32F-NEXT: vle32.v v8, (a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a0, v0 ; RV64ZVE32F-NEXT: andi a1, a0, 1 @@ -1744,7 +1737,6 @@ ; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB24_4 ; RV64ZVE32F-NEXT: .LBB24_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB24_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu @@ -1755,7 +1747,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v8, (a3) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i64> %val to <2 x i32> call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %tval, <2 x i32*> %ptrs, i32 4, <2 x i1> %m) diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll @@ -0,0 +1,295 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen2/i64/g' %s | llc -mtriple=riscv32 -mattr=+m | \ +; RUN: FileCheck %s --check-prefix=RV32 +; RUN: sed 's/iXLen2/i128/g' %s | llc -mtriple=riscv64 -mattr=+m | \ +; RUN: FileCheck %s --check-prefix=RV64 + +define iXLen2 @test_udiv_3(iXLen2 %x) nounwind { +; 
RV32-LABEL: test_udiv_3: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 3 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_3: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 3 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 3 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_5(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_5: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 5 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_5: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 5 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 5 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_7(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_7: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 7 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_7: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 7 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 7 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_9(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_9: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 9 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_9: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 9 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 9 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_15(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_15: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 15 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_15: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 15 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 15 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_17(iXLen2 %x) nounwind { +; RV32-LABEL: 
test_udiv_17: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 17 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_17: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 17 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 17 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_255(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_255: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 255 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_255: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 255 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_257(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_257: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 257 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_257: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 257 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 257 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_65535(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_65535: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_65535: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 65535 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_65537: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, 1 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_65537: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, 1 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded 
Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 65537 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_12(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_12: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 12 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_12: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 12 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 12 + ret iXLen2 %a +} diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll @@ -0,0 +1,296 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen2/i64/g' %s | llc -mtriple=riscv32 -mattr=+m | \ +; RUN: FileCheck %s --check-prefix=RV32 +; RUN: sed 's/iXLen2/i128/g' %s | llc -mtriple=riscv64 -mattr=+m | \ +; RUN: FileCheck %s --check-prefix=RV64 + +define iXLen2 @test_urem_3(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_3: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 3 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_3: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 3 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 3 + ret iXLen2 %a +} + +define iXLen2 @test_urem_5(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_5: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 5 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_5: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 5 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 5 + ret iXLen2 %a +} + +define iXLen2 @test_urem_7(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_7: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 7 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_7: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 7 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 7 + ret iXLen2 %a +} + +define iXLen2 @test_urem_9(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_9: +; RV32: # 
%bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 9 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_9: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 9 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 9 + ret iXLen2 %a +} + +define iXLen2 @test_urem_15(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_15: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 15 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_15: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 15 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 15 + ret iXLen2 %a +} + +define iXLen2 @test_urem_17(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_17: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 17 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_17: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 17 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 17 + ret iXLen2 %a +} + +define iXLen2 @test_urem_255(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_255: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 255 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_255: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 255 + ret iXLen2 %a +} + +define iXLen2 @test_urem_257(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_257: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 257 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_257: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 257 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 257 + ret iXLen2 %a +} + +define iXLen2 @test_urem_65535(iXLen2 %x) nounwind { +; RV32-LABEL: 
test_urem_65535: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_65535: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 65535 + ret iXLen2 %a +} + +define iXLen2 @test_urem_65537(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_65537: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, 1 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_65537: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, 1 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 65537 + ret iXLen2 %a +} + +define iXLen2 @test_urem_12(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_12: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 12 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_12: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 12 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 12 + ret iXLen2 %a +} + diff --git a/llvm/test/CodeGen/RISCV/trunc-free.ll b/llvm/test/CodeGen/RISCV/trunc-free.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/trunc-free.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv64 | FileCheck %s + +; Make sure we use lwu for the load, and don't emit +; a sext.w for the compare. This requires isTruncateFree +; to return true for i64->i32. Otherwise we emit a +; lw and a shift pair for the zext. 
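+; (Illustrative, hypothetical sketch, not autogenerated: the "lw and a shift pair" fallback would look roughly like "lw a0, 0(a0)" followed by "slli a0, a0, 32" and "srli a0, a0, 32" to clear the upper bits.)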
+ +define void @foo(i32* %p, i64* %q, i32* %r) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: lwu a0, 0(a0) +; CHECK-NEXT: sd a0, 0(a1) +; CHECK-NEXT: beqz a0, .LBB0_2 +; CHECK-NEXT: # %bb.1: # %if +; CHECK-NEXT: sw a0, 0(a2) +; CHECK-NEXT: .LBB0_2: # %end +; CHECK-NEXT: ret + %a = load i32, i32* %p + %b = zext i32 %a to i64 + store i64 %b, i64* %q + %c = icmp ne i32 %a, 0 + br i1 %c, label %if, label %end + +if: + store i32 %a, i32* %r + br label %end + +end: + ret void +} diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll --- a/llvm/test/CodeGen/X86/avx-insertelt.ll +++ b/llvm/test/CodeGen/X86/avx-insertelt.ll @@ -422,7 +422,7 @@ define <4 x i64> @insert_i64_two_elts_of_high_subvector(<4 x i64> %x, i64 %s) { ; AVX-LABEL: insert_i64_two_elts_of_high_subvector: ; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1 +; AVX-NEXT: vmovq %rdi, %xmm1 ; AVX-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -523,7 +523,7 @@ define <4 x i64> @insert_i64_two_elts_of_low_subvector(<4 x i64> %x, i64 %s) { ; AVX-LABEL: insert_i64_two_elts_of_low_subvector: ; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1 +; AVX-NEXT: vmovq %rdi, %xmm1 ; AVX-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -80,13 +80,13 @@ define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) { ; SSE-LABEL: combine_vec_mul_pow2c: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psllq $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psllq $4, %xmm2 ; SSE-NEXT: psllq $2, %xmm1 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddq %xmm0, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_mul_pow2c: diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -456,3 +456,481 @@ %6 = insertvalue { i64, i32 } %5, i32 %4, 1 ret { i64, i32 } %6 } + +define i64 @urem_i64_3(i64 %x) nounwind { +; X32-LABEL: urem_i64_3: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $3 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_3: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq %rdx +; X64-NEXT: leaq (%rdx,%rdx,2), %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 3 + ret i64 %rem +} + +define i64 @urem_i64_5(i64 %x) nounwind { +; X32-LABEL: urem_i64_5: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $5 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_5: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-3689348814741910323, %rcx # 
imm = 0xCCCCCCCCCCCCCCCD +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq $2, %rdx +; X64-NEXT: leaq (%rdx,%rdx,4), %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 5 + ret i64 %rem +} + +define i64 @urem_i64_15(i64 %x) nounwind { +; X32-LABEL: urem_i64_15: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $15 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_15: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq $3, %rdx +; X64-NEXT: leaq (%rdx,%rdx,4), %rax +; X64-NEXT: leaq (%rax,%rax,2), %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 15 + ret i64 %rem +} + +define i64 @urem_i64_17(i64 %x) nounwind { +; X32-LABEL: urem_i64_17: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $17 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_17: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: andq $-16, %rax +; X64-NEXT: shrq $4, %rdx +; X64-NEXT: addq %rax, %rdx +; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 17 + ret i64 %rem +} + +define i64 @urem_i64_255(i64 %x) nounwind { +; X32-LABEL: urem_i64_255: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $255 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_255: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq $7, %rdx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shlq $8, %rax +; X64-NEXT: subq %rax, %rdx +; X64-NEXT: leaq (%rdx,%rdi), %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 255 + ret i64 %rem +} + +define i64 @urem_i64_257(i64 %x) nounwind { +; X32-LABEL: urem_i64_257: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $257 # imm = 0x101 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_257: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: andq $-256, %rax +; X64-NEXT: shrq $8, %rdx +; X64-NEXT: addq %rax, %rdx +; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 257 + ret i64 %rem +} + +define i64 @urem_i64_65535(i64 %x) nounwind { +; X32-LABEL: urem_i64_65535: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $65535 # imm = 0xFFFF +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, 
%esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_65535: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq $15, %rdx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shlq $16, %rax +; X64-NEXT: subq %rax, %rdx +; X64-NEXT: leaq (%rdx,%rdi), %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 65535 + ret i64 %rem +} + +define i64 @urem_i64_65537(i64 %x) nounwind { +; X32-LABEL: urem_i64_65537: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $65537 # imm = 0x10001 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_65537: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000 +; X64-NEXT: shrq $16, %rdx +; X64-NEXT: addq %rax, %rdx +; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 65537 + ret i64 %rem +} + +define i64 @urem_i64_12(i64 %x) nounwind { +; X32-LABEL: urem_i64_12: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $12 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_12: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq %rdx +; X64-NEXT: andq $-4, %rdx +; X64-NEXT: leaq (%rdx,%rdx,2), %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 12 + ret i64 %rem +} + +define i64 @udiv_i64_3(i64 %x) nounwind { +; X32-LABEL: udiv_i64_3: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $3 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_3: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 3 + ret i64 %rem +} + +define i64 @udiv_i64_5(i64 %x) nounwind { +; X32-LABEL: udiv_i64_5: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $5 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_5: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-3689348814741910323, %rcx # imm = 0xCCCCCCCCCCCCCCCD +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $2, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 5 + ret i64 %rem +} + +define i64 @udiv_i64_15(i64 %x) nounwind { +; X32-LABEL: udiv_i64_15: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $15 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_15: +; X64: # %bb.0: # 
%entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $3, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 15 + ret i64 %rem +} + +define i64 @udiv_i64_17(i64 %x) nounwind { +; X32-LABEL: udiv_i64_17: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $17 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_17: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $4, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 17 + ret i64 %rem +} + +define i64 @udiv_i64_255(i64 %x) nounwind { +; X32-LABEL: udiv_i64_255: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $255 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_255: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $7, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 255 + ret i64 %rem +} + +define i64 @udiv_i64_257(i64 %x) nounwind { +; X32-LABEL: udiv_i64_257: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $257 # imm = 0x101 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_257: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $8, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 257 + ret i64 %rem +} + +define i64 @udiv_i64_65535(i64 %x) nounwind { +; X32-LABEL: udiv_i64_65535: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $65535 # imm = 0xFFFF +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_65535: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $15, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 65535 + ret i64 %rem +} + +define i64 @udiv_i64_65537(i64 %x) nounwind { +; X32-LABEL: udiv_i64_65537: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $65537 # imm = 0x10001 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_65537: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $16, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 65537 + ret i64 %rem +} + +define i64 @udiv_i64_12(i64 %x) nounwind { +; X32-LABEL: udiv_i64_12: 
+; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $12 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_12: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $3, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 12 + ret i64 %rem +} diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -123,3 +123,543 @@ %2 = trunc i128 %1 to i64 ret i64 %2 } + +define i128 @urem_i128_3(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_3: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $3, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_3: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 3 + ret i128 %rem +} + +define i128 @urem_i128_5(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_5: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $5, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_5: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $5, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 5 + ret i128 %rem +} + +define i128 @urem_i128_15(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_15: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $15, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_15: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $15, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 15 + ret i128 %rem +} + +define i128 @urem_i128_17(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_17: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; 
X86-64-NEXT: movl $17, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_17: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $17, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 17 + ret i128 %rem +} + +define i128 @urem_i128_255(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_255: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $255, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_255: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $255, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 255 + ret i128 %rem +} + +define i128 @urem_i128_257(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_257: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $257, %edx # imm = 0x101 +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_257: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $257, {{[0-9]+}}(%rsp) # imm = 0x101 +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 257 + ret i128 %rem +} + +define i128 @urem_i128_65535(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_65535: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_65535: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $65535, {{[0-9]+}}(%rsp) # imm = 0xFFFF +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 65535 + ret i128 %rem +} + +define i128 @urem_i128_65537(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_65537: +; 
X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $65537, %edx # imm = 0x10001 +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_65537: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $65537, {{[0-9]+}}(%rsp) # imm = 0x10001 +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 65537 + ret i128 %rem +} + +define i128 @urem_i128_12(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_12: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $12, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_12: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $12, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 12 + ret i128 %rem +} + +define i128 @udiv_i128_3(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_3: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $3, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_3: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 3 + ret i128 %rem +} + +define i128 @udiv_i128_5(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_5: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $5, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_5: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $5, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 5 + ret i128 %rem +} + +define i128 @udiv_i128_15(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_15: +; X86-64: # 
%bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $15, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_15: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $15, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 15 + ret i128 %rem +} + +define i128 @udiv_i128_17(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_17: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $17, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_17: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $17, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 17 + ret i128 %rem +} + +define i128 @udiv_i128_255(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_255: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $255, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_255: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $255, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 255 + ret i128 %rem +} + +define i128 @udiv_i128_257(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_257: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $257, %edx # imm = 0x101 +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_257: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $257, {{[0-9]+}}(%rsp) # imm = 0x101 +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 257 + ret i128 %rem +} + +define i128 @udiv_i128_65535(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_65535: +; 
X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_65535: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $65535, {{[0-9]+}}(%rsp) # imm = 0xFFFF +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 65535 + ret i128 %rem +} + +define i128 @udiv_i128_65537(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_65537: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $65537, %edx # imm = 0x10001 +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_65537: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $65537, {{[0-9]+}}(%rsp) # imm = 0x10001 +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 65537 + ret i128 %rem +} + +define i128 @udiv_i128_12(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_12: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $12, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_12: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $12, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 12 + ret i128 %rem +} diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -398,7 +398,7 @@ define <2 x i64> @freeze_shl_vec_outofrange(<2 x i64> %a0) nounwind { ; X86-LABEL: freeze_shl_vec_outofrange: ; X86: # %bb.0: -; X86-NEXT: psllq $1, %xmm0 +; X86-NEXT: paddq %xmm0, %xmm0 ; X86-NEXT: psllq $2, %xmm0 ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/gcc_except_table_bb_sections_nolpads.ll b/llvm/test/CodeGen/X86/gcc_except_table_bb_sections_nolpads.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/gcc_except_table_bb_sections_nolpads.ll @@ -0,0 +1,44 @@ +;; Verify that @LPStart is omitted when there are no landing pads. This test +;; uses an unknown personality to force emitting the exception table.
+ +; RUN: llc -basic-block-sections=all -mtriple=x86_64 < %s | FileCheck %s + +declare void @throwit() +declare i32 @__unknown_ehpersonality(...) + +define void @foo(i1 %cond) uwtable personality ptr @__unknown_ehpersonality { +entry: + br i1 %cond, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + call void @throwit() + unreachable + +cond.false: ; preds = %entry + ret void +} + +; CHECK: GCC_except_table0: +; CHECK-NEXT: .Lexception0: +; CHECK-NEXT: .byte 255 # @LPStart Encoding = omit +; CHECK-NEXT: .byte 255 # @TType Encoding = omit +; CHECK-NEXT: .byte 1 # Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Laction_table_base0-.Lcst_begin0 +; CHECK-NEXT: .Lcst_begin0: +; CHECK-NEXT: .Lexception1: +; CHECK-NEXT: .byte 255 # @LPStart Encoding = omit +; CHECK-NEXT: .byte 255 # @TType Encoding = omit +; CHECK-NEXT: .byte 1 # Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Laction_table_base0-.Lcst_begin1 +; CHECK-NEXT: .Lcst_begin1: +; CHECK-NEXT: .Lexception2: +; CHECK-NEXT: .byte 255 # @LPStart Encoding = omit +; CHECK-NEXT: .byte 255 # @TType Encoding = omit +; CHECK-NEXT: .byte 1 # Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Laction_table_base0-.Lcst_begin2 +; CHECK-NEXT: .Lcst_begin2: +; CHECK-NEXT: .uleb128 foo.__part.2-foo.__part.2 # >> Call Site 1 << +; CHECK-NEXT: .uleb128 .LBB_END0_2-foo.__part.2 # Call between foo.__part.2 and .LBB_END0_2 +; CHECK-NEXT: .byte 0 # has no landing pad +; CHECK-NEXT: .byte 0 # On action: cleanup +; CHECK-NEXT: .Laction_table_base0: diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -157,71 +157,71 @@ ; SSE2-LABEL: PR42833: ; SSE2: # %bb.0: ; SSE2-NEXT: movl b(%rip), %eax -; SSE2-NEXT: movdqa c+144(%rip), %xmm0 -; SSE2-NEXT: movdqa c+128(%rip), %xmm1 +; SSE2-NEXT: movdqa c+128(%rip), %xmm0 +; SSE2-NEXT: movdqa c+144(%rip), %xmm1 ; SSE2-NEXT: addl c+128(%rip), %eax ; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: movdqa d+144(%rip), %xmm4 -; SSE2-NEXT: psubd %xmm0, %xmm4 -; SSE2-NEXT: paddd %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: psubd %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] -; SSE2-NEXT: movdqa %xmm0, c+144(%rip) +; SSE2-NEXT: movdqa %xmm1, c+144(%rip) ; SSE2-NEXT: movaps %xmm5, c+128(%rip) -; SSE2-NEXT: movdqa c+160(%rip), %xmm0 +; SSE2-NEXT: movdqa c+160(%rip), %xmm1 ; SSE2-NEXT: movdqa c+176(%rip), %xmm3 ; SSE2-NEXT: movdqa d+160(%rip), %xmm5 ; SSE2-NEXT: movdqa d+176(%rip), %xmm6 ; SSE2-NEXT: movdqa d+128(%rip), %xmm7 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE2-NEXT: psubd %xmm1, %xmm7 +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE2-NEXT: psubd %xmm0, %xmm7 ; SSE2-NEXT: psubd %xmm3, %xmm6 -; SSE2-NEXT: psubd %xmm0, %xmm5 +; SSE2-NEXT: psubd %xmm1, %xmm5 ; SSE2-NEXT: movdqa %xmm5, d+160(%rip) ; SSE2-NEXT: movdqa %xmm6, d+176(%rip) ; SSE2-NEXT: movdqa %xmm4, d+144(%rip) ; SSE2-NEXT: movdqa %xmm7, d+128(%rip) ; SSE2-NEXT: paddd %xmm3, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, c+160(%rip) +; SSE2-NEXT: paddd %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, c+160(%rip) ; SSE2-NEXT: movdqa %xmm3, c+176(%rip) ; SSE2-NEXT: retq ; ; SSE42-LABEL: PR42833: ; SSE42: # 
%bb.0: ; SSE42-NEXT: movl b(%rip), %eax -; SSE42-NEXT: movdqa c+144(%rip), %xmm0 -; SSE42-NEXT: movdqa c+128(%rip), %xmm1 +; SSE42-NEXT: movdqa c+128(%rip), %xmm0 +; SSE42-NEXT: movdqa c+144(%rip), %xmm1 ; SSE42-NEXT: addl c+128(%rip), %eax ; SSE42-NEXT: movd %eax, %xmm2 -; SSE42-NEXT: paddd %xmm1, %xmm2 +; SSE42-NEXT: paddd %xmm0, %xmm2 ; SSE42-NEXT: movdqa d+144(%rip), %xmm3 -; SSE42-NEXT: psubd %xmm0, %xmm3 -; SSE42-NEXT: paddd %xmm0, %xmm0 -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: paddd %xmm1, %xmm4 +; SSE42-NEXT: psubd %xmm1, %xmm3 +; SSE42-NEXT: paddd %xmm1, %xmm1 +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: paddd %xmm0, %xmm4 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, c+144(%rip) +; SSE42-NEXT: movdqa %xmm1, c+144(%rip) ; SSE42-NEXT: movdqa %xmm4, c+128(%rip) -; SSE42-NEXT: movdqa c+160(%rip), %xmm0 +; SSE42-NEXT: movdqa c+160(%rip), %xmm1 ; SSE42-NEXT: movdqa c+176(%rip), %xmm2 ; SSE42-NEXT: movdqa d+160(%rip), %xmm4 ; SSE42-NEXT: movdqa d+176(%rip), %xmm5 ; SSE42-NEXT: movdqa d+128(%rip), %xmm6 -; SSE42-NEXT: pinsrd $0, %eax, %xmm1 -; SSE42-NEXT: psubd %xmm1, %xmm6 +; SSE42-NEXT: pinsrd $0, %eax, %xmm0 +; SSE42-NEXT: psubd %xmm0, %xmm6 ; SSE42-NEXT: psubd %xmm2, %xmm5 -; SSE42-NEXT: psubd %xmm0, %xmm4 +; SSE42-NEXT: psubd %xmm1, %xmm4 ; SSE42-NEXT: movdqa %xmm4, d+160(%rip) ; SSE42-NEXT: movdqa %xmm5, d+176(%rip) ; SSE42-NEXT: movdqa %xmm3, d+144(%rip) ; SSE42-NEXT: movdqa %xmm6, d+128(%rip) ; SSE42-NEXT: paddd %xmm2, %xmm2 -; SSE42-NEXT: paddd %xmm0, %xmm0 -; SSE42-NEXT: movdqa %xmm0, c+160(%rip) +; SSE42-NEXT: paddd %xmm1, %xmm1 +; SSE42-NEXT: movdqa %xmm1, c+160(%rip) ; SSE42-NEXT: movdqa %xmm2, c+176(%rip) ; SSE42-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -111,21 +111,18 @@ ; XOPAVX1-LABEL: rot_v4i32_mask_ashr0: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: rot_v4i32_mask_ashr0: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_mask_ashr0: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = ashr <4 x i32> %a0, @@ -139,7 +136,6 @@ ; XOPAVX1-LABEL: rot_v4i32_mask_ashr1: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: retq @@ -147,7 +143,6 @@ ; XOPAVX2-LABEL: rot_v4i32_mask_ashr1: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsrad $25, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -155,7 +150,6 @@ ; AVX512-LABEL: rot_v4i32_mask_ashr1: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $25, %xmm0, %xmm0 -; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -581,28 +581,33 @@ ; X64-NEXT: subq $104, %rsp ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; X64-NEXT: psllq $32, %xmm3 +; X64-NEXT: movdqa %xmm3, %xmm2 +; X64-NEXT: psrad $31, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64-NEXT: psrlq $31, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-NEXT: paddq %xmm0, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %rbp +; X64-NEXT: movq %xmm0, %r15 +; X64-NEXT: movq %r15, %rbp ; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shldq $31, %rbx, %rbp +; X64-NEXT: shldq $31, %r15, %rbp +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r15, %r12 ; X64-NEXT: shlq $31, %r12 ; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %rbp, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -610,16 +615,16 @@ ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r13 ; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: xorl %ebx, %r15d ; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al -; X64-NEXT: testb %bl, %al +; X64-NEXT: testb %r15b, %al ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF @@ -699,43 +704,45 @@ ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-NEXT: psrlq $1, %xmm1 ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; X64-NEXT: # xmm1 = mem[2,3,2,3] -; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: paddq %xmm1, %xmm1 -; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm1, %rbx -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: sarq $63, %r12 -; X64-NEXT: shldq $31, %rbx, %r12 -; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; X64-NEXT: # xmm1 = mem[2,3,2,3] -; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-NEXT: punpckldq 
{{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm1, %rdx +; X64-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[0,1,1,3] +; X64-NEXT: psllq $32, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrad $31, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; X64-NEXT: psrlq $31, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm0, %rbx +; X64-NEXT: movq %rbx, %r13 +; X64-NEXT: sarq $63, %r13 +; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: sarq $63, %rbp ; X64-NEXT: movq %rbx, %r15 ; X64-NEXT: shlq $31, %r15 ; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r12, %rsi +; X64-NEXT: movq %r13, %rsi ; X64-NEXT: movq %rbp, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %r13 +; X64-NEXT: subq $1, %r12 ; X64-NEXT: sbbq $0, %r14 ; X64-NEXT: shrq $63, %rbx ; X64-NEXT: xorl %ebp, %ebx ; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r12, %rsi +; X64-NEXT: movq %r13, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; X64-NEXT: movq %rbp, %rcx ; X64-NEXT: callq __modti3@PLT @@ -743,25 +750,25 @@ ; X64-NEXT: setne %al ; X64-NEXT: testb %bl, %al ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; X64-NEXT: cmovbq %r13, %rax +; X64-NEXT: cmovbq %r12, %rax ; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovnsq %rcx, %r13 -; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: cmovnsq %rcx, %r12 +; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movl $0, %eax ; X64-NEXT: cmovnsq %rax, %r14 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: cmovaq %r13, %rax +; X64-NEXT: cmovaq %r12, %rax ; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovsq %rcx, %r13 +; X64-NEXT: cmovsq %rcx, %r12 ; X64-NEXT: cmpq $-1, %r14 -; X64-NEXT: cmoveq %rax, %r13 -; X64-NEXT: movq %r13, %xmm0 +; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: movq %r12, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] @@ -816,12 +823,12 @@ ; X64-NEXT: cmovsq %rcx, %r12 ; X64-NEXT: cmpq $-1, %r14 ; X64-NEXT: cmoveq %rax, %r12 -; X64-NEXT: movq %r12, %xmm0 -; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0] -; X64-NEXT: psrlq $1, %xmm1 -; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-NEXT: movq %r12, %xmm1 +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: psrlq $1, %xmm0 +; X64-NEXT: shufps $136, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = xmm0[0,2],mem[0,2] ; X64-NEXT: addq $104, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 @@ -840,116 +847,108 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $256, %esp # imm = 0x100 -; X86-NEXT: movl 24(%ebp), %edx -; X86-NEXT: movl 40(%ebp), %edi -; X86-NEXT: leal {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: addl %edx, %edx -; X86-NEXT: adcl %eax, %eax +; X86-NEXT: movl 16(%ebp), %edi +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shldl $31, %edx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $1, %eax -; X86-NEXT: negl %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: leal (%edi,%edi), %eax +; X86-NEXT: shrl $31, %edi +; X86-NEXT: shldl $31, %eax, %edi +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %edx +; X86-NEXT: calll __divti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl 32(%ebp) +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 36(%ebp), %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl 20(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: adcl %edx, %edx +; X86-NEXT: movl 36(%ebp), %edx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: shldl $31, %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %ecx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: leal (%ecx,%ecx), %eax +; X86-NEXT: shrl $31, %ecx +; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $1, %edx -; X86-NEXT: negl %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx 
+; X86-NEXT: pushl %edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl 28(%ebp), %ebx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: addl %eax, %eax -; X86-NEXT: adcl %esi, %esi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: leal (%ecx,%ecx), %eax +; X86-NEXT: shrl $31, %ecx ; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $1, %esi -; X86-NEXT: negl %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ebx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edi ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 40(%ebp), %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl 16(%ebp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: adcl %ebx, %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shldl $31, %ecx, %edi -; X86-NEXT: shll $31, %ecx +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: leal (%ecx,%ecx), %eax +; X86-NEXT: shrl $31, %ecx +; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $1, %ebx -; X86-NEXT: negl %ebx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp @@ -958,39 +957,25 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl 32(%ebp) -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl %eax -; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx ; X86-NEXT: pushl 40(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %edi +; X86-NEXT: 
pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl 36(%ebp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp @@ -1005,22 +990,22 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: sets %bl ; X86-NEXT: testl %edi, %edi -; X86-NEXT: sets %al -; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %ah -; X86-NEXT: xorb %al, %ah +; X86-NEXT: sets %bh +; X86-NEXT: xorb %bl, %bh ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: orl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %edi, %eax ; X86-NEXT: setne %al -; X86-NEXT: testb %ah, %al +; X86-NEXT: testb %bh, %al ; X86-NEXT: cmovel %esi, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1030,7 +1015,7 @@ ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: cmovel %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1047,7 +1032,7 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bl ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bh @@ -1085,11 +1070,11 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: sets %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: testl %edx, %edx +; X86-NEXT: sets %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %bl ; X86-NEXT: xorb %al, %bl ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -1100,7 +1085,7 @@ ; X86-NEXT: pushl %ecx ; 
X86-NEXT: pushl %ecx ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -382,93 +382,85 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: addl %eax, %eax -; X86-NEXT: setb %cl -; X86-NEXT: shldl $31, %eax, %ecx -; X86-NEXT: shll $31, %eax +; X86-NEXT: leal (%eax,%eax), %ecx +; X86-NEXT: shrl $31, %eax +; X86-NEXT: shldl $31, %ecx, %eax ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %eax +; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: addl %ebp, %ebp -; X86-NEXT: setb %al -; X86-NEXT: shldl $31, %ebp, %eax -; X86-NEXT: shll $31, %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: leal (%ebx,%ebx), %eax +; X86-NEXT: shrl $31, %ebx +; X86-NEXT: shldl $31, %eax, %ebx ; X86-NEXT: pushl $0 -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %eax ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: addl %edi, %edi -; X86-NEXT: setb %al -; X86-NEXT: shldl $31, %edi, %eax -; X86-NEXT: shll $31, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: leal (%esi,%esi), %eax +; X86-NEXT: shrl $31, %esi +; X86-NEXT: shldl $31, %eax, %esi ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: addl %esi, %esi -; X86-NEXT: setb %al -; X86-NEXT: shldl $31, %esi, %eax -; X86-NEXT: shll $31, %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: leal (%edx,%edx), %ecx +; X86-NEXT: shrl $31, %edx +; X86-NEXT: shldl $31, %ecx, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: cmpl $2, %esi +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: cmpl $1, %esi +; X86-NEXT: movl $1, %ebp +; X86-NEXT: cmovael %ebp, %esi +; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: cmpl $2, %ebx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: cmpl $1, %ebx +; X86-NEXT: cmovael %ebp, %ebx +; X86-NEXT: shldl $31, %eax, %ebx +; X86-NEXT: cmpl $2, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: cmpl $1, %edi +; X86-NEXT: cmovael %ebp, %edi +; X86-NEXT: shldl $31, %eax, %edi ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ecx +; X86-NEXT: 
pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: cmpl $2, %edx -; X86-NEXT: movl $-1, %esi -; X86-NEXT: cmovael %esi, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovael %ecx, %eax ; X86-NEXT: cmpl $1, %edx -; X86-NEXT: movl $1, %ecx -; X86-NEXT: cmovael %ecx, %edx -; X86-NEXT: shldl $31, %eax, %edx -; X86-NEXT: cmpl $2, %edi -; X86-NEXT: cmovael %esi, %ebx -; X86-NEXT: cmpl $1, %edi -; X86-NEXT: cmovael %ecx, %edi -; X86-NEXT: shldl $31, %ebx, %edi -; X86-NEXT: cmpl $2, %ebp -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: cmovael %esi, %eax -; X86-NEXT: cmpl $1, %ebp -; X86-NEXT: cmovael %ecx, %ebp +; X86-NEXT: cmovbl %edx, %ebp ; X86-NEXT: shldl $31, %eax, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: cmpl $2, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovael %esi, %eax -; X86-NEXT: cmpl $1, %ebx -; X86-NEXT: cmovbl %ebx, %ecx -; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $12, %esp +; X86-NEXT: movl %ebp, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -36,7 +36,7 @@ ; SSE2-NEXT: psrlq %xmm4, %xmm1 ; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1] ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: psllq $1, %xmm0 +; SSE2-NEXT: paddq %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psllq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] @@ -56,12 +56,12 @@ ; SSE41-NEXT: psrlq %xmm4, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pandn %xmm3, %xmm2 -; SSE41-NEXT: psllq $1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE41-NEXT: paddq %xmm0, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllq %xmm1, %xmm3 ; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: retq ; @@ -74,11 +74,11 @@ ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -88,7 +88,7 @@ ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; 
AVX2-NEXT: retq @@ -99,7 +99,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -110,7 +110,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -121,7 +121,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -142,7 +142,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -156,13 +156,13 @@ ; XOPAVX1-LABEL: var_funnnel_v2i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 -; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -172,7 +172,7 @@ ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsllq $1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -188,7 +188,7 @@ ; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 ; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1] ; X86-SSE2-NEXT: pandn %xmm4, %xmm2 -; X86-SSE2-NEXT: psllq $1, %xmm0 +; X86-SSE2-NEXT: paddq %xmm0, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psllq %xmm2, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] @@ -225,7 +225,7 @@ ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 -; SSE2-NEXT: pslld $1, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -260,7 +260,7 @@ ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 -; SSE41-NEXT: pslld $1, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: retq @@ -285,7 +285,7 @@ 
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -296,7 +296,7 @@ ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -307,7 +307,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -318,7 +318,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -329,7 +329,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -350,7 +350,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -364,13 +364,13 @@ ; XOPAVX1-LABEL: var_funnnel_v4i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] -; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 -; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -380,7 +380,7 @@ ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpslld $1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -409,7 +409,7 @@ ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 -; X86-SSE2-NEXT: pslld $1, %xmm0 +; X86-SSE2-NEXT: paddd %xmm0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -473,7 
+473,7 @@ ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm4, %xmm2 -; SSE2-NEXT: psllw $1, %xmm0 +; SSE2-NEXT: paddw %xmm0, %xmm0 ; SSE2-NEXT: pmullw %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 @@ -519,7 +519,7 @@ ; SSE41-NEXT: paddd %xmm4, %xmm0 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0 ; SSE41-NEXT: packusdw %xmm2, %xmm0 -; SSE41-NEXT: psllw $1, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm3 ; SSE41-NEXT: pmullw %xmm0, %xmm3 ; SSE41-NEXT: por %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 @@ -554,7 +554,7 @@ ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -608,7 +608,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -630,7 +630,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -644,13 +644,13 @@ ; XOP-LABEL: var_funnnel_v8i16: ; XOP: # %bb.0: ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpsllw $1, %xmm0, %xmm0 -; XOP-NEXT: vpshlw %xmm4, %xmm0, %xmm0 -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vpshlw %xmm2, %xmm1, %xmm1 +; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOP-NEXT: vpsubw %xmm4, %xmm5, %xmm4 +; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; @@ -703,7 +703,7 @@ ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 ; X86-SSE2-NEXT: packssdw %xmm4, %xmm2 -; X86-SSE2-NEXT: psllw $1, %xmm0 +; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: pmullw %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm3, %xmm0 @@ -1036,7 +1036,7 @@ ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: psrlq %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: psllq $1, %xmm0 +; SSE-NEXT: paddq %xmm0, %xmm0 ; SSE-NEXT: psllq %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1047,7 +1047,7 @@ ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -1058,7 +1058,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1069,7 +1069,7 @@ ; 
AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1080,7 +1080,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -1101,7 +1101,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1119,7 +1119,7 @@ ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpsllq $1, %xmm0, %xmm0 +; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -1131,7 +1131,7 @@ ; X86-SSE2-NEXT: pand %xmm3, %xmm4 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: psllq $1, %xmm0 +; X86-SSE2-NEXT: paddq %xmm0, %xmm0 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -1256,7 +1256,7 @@ ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: psrlw %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: psllw $1, %xmm0 +; SSE-NEXT: paddw %xmm0, %xmm0 ; SSE-NEXT: psllw %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1267,7 +1267,7 @@ ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -1278,7 +1278,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1289,7 +1289,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1300,7 +1300,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -1321,7 +1321,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor 
%xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1339,7 +1339,7 @@ ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -1351,7 +1351,7 @@ ; X86-SSE2-NEXT: pand %xmm3, %xmm4 ; X86-SSE2-NEXT: psrlw %xmm4, %xmm1 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: psllw $1, %xmm0 +; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: psllw %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -1761,7 +1761,7 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: psllw $1, %xmm0 +; SSE2-NEXT: paddw %xmm0, %xmm0 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 @@ -1772,7 +1772,7 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = ; SSE41-NEXT: pmulhuw %xmm1, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psllw $1, %xmm0 +; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -1781,7 +1781,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -1790,7 +1790,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1799,7 +1799,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1810,7 +1810,7 @@ ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] -; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1829,7 +1829,7 @@ ; AVX512VLBW-LABEL: constant_funnnel_v8i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1843,7 +1843,7 @@ ; XOP-LABEL: constant_funnnel_v8i16: ; XOP: # %bb.0: ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vpshlw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -1853,7 +1853,7 @@ ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psllw $1, %xmm0 +; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -37,17 +37,17 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsllq $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddq %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsllq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -58,7 +58,7 @@ ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -69,7 +69,7 @@ ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -80,7 +80,7 @@ ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -91,7 +91,7 @@ ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -111,7 +111,7 @@ ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; 
AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -125,23 +125,23 @@ ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] -; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 +; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; XOPAVX1-NEXT: vpsllq $1, %xmm6, %xmm6 -; XOPAVX1-NEXT: vpshlq %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 +; XOPAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; XOPAVX1-NEXT: vpsubq %xmm5, %xmm6, %xmm5 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; XOPAVX1-NEXT: vpshlq %xmm5, %xmm7, %xmm5 +; XOPAVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; XOPAVX1-NEXT: vpshlq %xmm3, %xmm5, %xmm3 -; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 -; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; XOPAVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpshlq %xmm3, %xmm4, %xmm3 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -151,7 +151,7 @@ ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; XOPAVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -184,7 +184,7 @@ ; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpslld $1, %xmm7, %xmm7 +; AVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7 ; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -202,7 +202,7 @@ ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -214,7 +214,7 @@ ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -225,7 +225,7 @@ ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -236,7 +236,7 @@ ; AVX512VL-NEXT: 
vpand %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -247,7 +247,7 @@ ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -267,7 +267,7 @@ ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -289,13 +289,13 @@ ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [31,31,31,31] ; XOPAVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; XOPAVX1-NEXT: vpslld $1, %xmm7, %xmm7 +; XOPAVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7 ; XOPAVX1-NEXT: vpshld %xmm3, %xmm7, %xmm3 ; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -307,7 +307,7 @@ ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; XOPAVX2-NEXT: vpslld $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -348,7 +348,7 @@ ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpsllw $1, %xmm7, %xmm7 +; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 ; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm6 @@ -375,7 +375,7 @@ ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 @@ -427,7 +427,7 @@ ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -447,7 +447,7 @@ ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -469,13 +469,13 @@ ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15] ; XOPAVX1-NEXT: vpxor 
%xmm6, %xmm3, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; XOPAVX1-NEXT: vpsllw $1, %xmm7, %xmm7 +; XOPAVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 ; XOPAVX1-NEXT: vpshlw %xmm3, %xmm7, %xmm3 ; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -484,22 +484,22 @@ ; XOPAVX2-LABEL: var_funnnel_v16i16: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4 +; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0 -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6 -; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5 -; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; XOPAVX2-NEXT: vpsubw %xmm5, %xmm6, %xmm5 +; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 +; XOPAVX2-NEXT: vpshlw %xmm5, %xmm7, %xmm5 +; XOPAVX2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3 -; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3 -; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2 -; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3 +; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) @@ -782,9 +782,9 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsllq $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -796,7 +796,7 @@ ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -807,7 +807,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -818,7 +818,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; 
AVX512VL-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -829,7 +829,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -849,7 +849,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -871,9 +871,9 @@ ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpsllq $1, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpaddq %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -885,7 +885,7 @@ ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -1020,11 +1020,11 @@ ; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsllw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -1036,7 +1036,7 @@ ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1047,7 +1047,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1058,7 +1058,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -1069,7 +1069,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; 
AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1089,7 +1089,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1109,11 +1109,11 @@ ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpsllw $1, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -1125,7 +1125,7 @@ ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -1494,10 +1494,10 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm2 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1508,7 +1508,7 @@ ; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1518,7 +1518,7 @@ ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1528,7 +1528,7 @@ ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -1539,7 +1539,7 @@ ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 
-; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1556,7 +1556,7 @@ ; AVX512VLBW-LABEL: constant_funnnel_v16i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1573,10 +1573,10 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm2 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm2 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1587,7 +1587,7 @@ ; XOPAVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -22,7 +22,7 @@ ; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -33,7 +33,7 @@ ; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512VL-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -44,7 +44,7 @@ ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -61,7 +61,7 @@ ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -82,7 +82,7 @@ ; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; 
AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -93,7 +93,7 @@ ; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512VL-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -104,7 +104,7 @@ ; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -121,7 +121,7 @@ ; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -188,7 +188,7 @@ ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -205,7 +205,7 @@ ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -428,7 +428,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -439,7 +439,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -450,7 +450,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -468,7 +468,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -554,9 +554,9 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpsllw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX512F-NEXT: 
vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -572,9 +572,9 @@ ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpsllw $1, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddw %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -586,7 +586,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -604,7 +604,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -853,7 +853,7 @@ ; AVX512BW-LABEL: constant_funnnel_v32i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -867,7 +867,7 @@ ; AVX512VLBW-LABEL: constant_funnnel_v32i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -963,7 +963,7 @@ ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: psrlw %xmm3, %xmm4 ; SSE41-NEXT: pandn %xmm2, %xmm1 -; SSE41-NEXT: psllw $1, %xmm0 +; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: psllw %xmm1, %xmm0 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: retq @@ -974,7 +974,7 @@ ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -985,7 +985,7 @@ ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -996,7 +996,7 @@ ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddw %xmm0, 
%xmm0, %xmm0 ; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1007,7 +1007,7 @@ ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -1018,7 +1018,7 @@ ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -789,11 +789,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm5 ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $1, %xmm4, %xmm2 +; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm2 ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -805,7 +805,7 @@ ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -816,7 +816,7 @@ ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -827,7 +827,7 @@ ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -838,7 +838,7 @@ ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -849,7 +849,7 @@ ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -301,9 +301,9 @@ ; AVX512F-NEXT: vpsrlw %xmm3, 
%ymm0, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllw $1, %ymm4, %ymm2 +; AVX512F-NEXT: vpaddw %ymm4, %ymm4, %ymm2 ; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 @@ -318,9 +318,9 @@ ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpsllw $1, %ymm4, %ymm2 +; AVX512VL-NEXT: vpaddw %ymm4, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 @@ -332,7 +332,7 @@ ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -343,7 +343,7 @@ ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -18,7 +18,7 @@ ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 ; CHECK-NEXT: pmulhw %xmm1, %xmm0 -; CHECK-NEXT: psllw $1, %xmm0 +; CHECK-NEXT: paddw %xmm0, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: retq %t = call <4 x i16> @llvm.smul.fix.v4i16(<4 x i16> , <4 x i16> %a, i32 15) @@ -33,7 +33,7 @@ ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 ; CHECK-NEXT: pmulhuw %xmm1, %xmm0 -; CHECK-NEXT: psllw $1, %xmm0 +; CHECK-NEXT: paddw %xmm0, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: retq %t = call <4 x i16> @llvm.umul.fix.v4i16(<4 x i16> , <4 x i16> %a, i32 15) diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -927,23 +927,23 @@ ; SSE2-LABEL: constant_shift_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllq $1, %xmm1 -; SSE2-NEXT: psllq $7, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: psllq $7, %xmm1 +; SSE2-NEXT: paddq %xmm0, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psllq $7, %xmm1 -; SSE41-NEXT: psllq $1, %xmm0 +; SSE41-NEXT: paddq %xmm0, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; @@ -975,9 +975,9 @@ ; X86-SSE-LABEL: constant_shift_v2i64: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psllq $1, %xmm1 -; X86-SSE-NEXT: psllq $7, %xmm0 -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE-NEXT: psllq $7, %xmm1 +; X86-SSE-NEXT: paddq %xmm0, %xmm0 +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X86-SSE-NEXT: retl %shift = shl <2 x i64> %a, <i64 1, i64 7> ret <2 x i64> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -1061,7 +1061,7 @@ ; AVX1-NEXT: vpsllq $31, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsllq $7, %xmm0, %xmm2 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1101,7 +1101,7 @@ ; X86-AVX1-NEXT: vpsllq $31, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; X86-AVX1-NEXT: vpsllq $7, %xmm0, %xmm2 -; X86-AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_minimal.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_minimal.s new file mode 100644 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_minimal.s @@ -0,0 +1,18 @@ +# RUN: llvm-mc -triple=i386-unknown-linux-gnu -position-independent -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec %t.o + + .text + .globl main + .p2align 4 + .type main,@function +main: + pushl %ebp + movl %esp, %ebp + pushl %eax + movl $0, -4(%ebp) + movl $42, %eax + addl $4, %esp + popl %ebp + retl + + .size main, .-main \ No newline at end of file diff --git a/llvm/test/ExecutionEngine/JITLink/i386/lit.local.cfg b/llvm/test/ExecutionEngine/JITLink/i386/lit.local.cfg new file mode 100644 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/i386/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'i386' in config.root.targets: + config.unsupported = True \ No newline at end of file diff --git a/llvm/test/MC/ELF/debug-hash-file.s b/llvm/test/MC/ELF/debug-hash-file.s --- a/llvm/test/MC/ELF/debug-hash-file.s +++ b/llvm/test/MC/ELF/debug-hash-file.s @@ -23,6 +23,26 @@ // DWARF5-NEXT: dir_index: 0 // DWARF5-NOT: file_names[ 1]: +// RUN: llvm-mc -triple=x86_64 -filetype=obj -g -dwarf-version=4 -fdebug-prefix-map=/MyTest=/src_root %s -o %t.4.o +// RUN: llvm-dwarfdump -debug-info -debug-line %t.4.o | FileCheck %s --check-prefixes=MAP,MAP_V4 +// RUN: llvm-mc -triple=x86_64 -filetype=obj -g -dwarf-version=5 -fdebug-prefix-map=/MyTest=/src_root %s -o %t.5.o +// RUN: llvm-dwarfdump -debug-info -debug-line %t.5.o | FileCheck %s --check-prefixes=MAP,MAP_V5 + +// MAP-LABEL: DW_TAG_compile_unit +// MAP: DW_AT_name ("/src_root/Inputs{{(/|\\)+}}other.S") +// MAP-LABEL: DW_TAG_label +// MAP: DW_AT_decl_file ("/src_root/Inputs{{(/|\\)+}}other.S") + +// MAP_V4: include_directories[ 1] = "/src_root/Inputs" +// MAP_V4-NEXT: file_names[ 1]: +// MAP_V4-NEXT: name: "other.S" +// MAP_V4-NEXT: dir_index: 1 + +// MAP_V5: include_directories[ 0] = "{{.*}}" +// MAP_V5-NEXT: file_names[ 0]: +// MAP_V5-NEXT:
name: "/src_root/Inputs/other.S" +// MAP_V5-NEXT: dir_index: 0 + # 1 "/MyTest/Inputs/other.S" foo: diff --git a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll --- a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll +++ b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll @@ -275,7 +275,7 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP1]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[FOR_COND]] ; CHECK: for.end: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[SUM_0]], [[FOR_COND]] ] @@ -410,7 +410,7 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP2]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i64 0, [[INDVARS_IV_NEXT]] ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[FOR_END]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll b/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll --- a/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll @@ -75,7 +75,7 @@ ; CHECK-NEXT: br label [[B18:%.*]] ; CHECK: B18: ; CHECK-NEXT: [[DOT02:%.*]] = phi i32 [ [[TMP33:%.*]], [[B24:%.*]] ], [ 0, [[B18_PREHEADER]] ] -; CHECK-NEXT: [[TMP33]] = add nuw nsw i32 [[DOT02]], 1 +; CHECK-NEXT: [[TMP33]] = add nuw i32 [[DOT02]], 1 ; CHECK-NEXT: [[O:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 [[DOT02]] ; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[O]], align 4 ; CHECK-NEXT: [[T:%.*]] = icmp eq i32 [[V]], 0 @@ -167,11 +167,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[SIZE]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[HSIZE:%.*]] to i64 ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[NSTEPS:%.*]], i32 1) -; CHECK-NEXT: [[WIDE_TRIP_COUNT14:%.*]] = zext i32 [[SMAX]] to i64 +; CHECK-NEXT: [[WIDE_TRIP_COUNT11:%.*]] = zext i32 [[SMAX]] to i64 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV9:%.*]] = phi i64 [ [[INDVARS_IV_NEXT10:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 [[INDVARS_IV9]], [[TMP0]] +; CHECK-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 [[INDVARS_IV7]], [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[CMP215]], label [[FOR_BODY2_PREHEADER:%.*]], label [[FOR_INC]] ; CHECK: for.body2.preheader: @@ -188,22 +188,22 @@ ; CHECK: for.body3.preheader: ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 -; CHECK-NEXT: [[WIDE_TRIP_COUNT7:%.*]] = zext i32 [[SIZE]] to i64 +; CHECK-NEXT: [[WIDE_TRIP_COUNT5:%.*]] = zext i32 [[SIZE]] to i64 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ 1, [[FOR_BODY3_PREHEADER]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw 
i64 [[TMP6]], [[INDVARS_IV3]] +; CHECK-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ 1, [[FOR_BODY3_PREHEADER]] ], [ [[INDVARS_IV_NEXT3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], [[INDVARS_IV2]] ; CHECK-NEXT: [[ADD_PTR2:%.*]] = getelementptr inbounds i8, i8* [[BC0]], i64 [[TMP7]] ; CHECK-NEXT: store i8 [[TMP1]], i8* [[ADD_PTR2]], align 1 -; CHECK-NEXT: [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1 -; CHECK-NEXT: [[EXITCOND8:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT4]], [[WIDE_TRIP_COUNT7]] -; CHECK-NEXT: br i1 [[EXITCOND8]], label [[FOR_BODY3]], label [[FOR_INC_LOOPEXIT:%.*]] +; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1 +; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], [[WIDE_TRIP_COUNT5]] +; CHECK-NEXT: br i1 [[EXITCOND6]], label [[FOR_BODY3]], label [[FOR_INC_LOOPEXIT:%.*]] ; CHECK: for.inc.loopexit: ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: -; CHECK-NEXT: [[INDVARS_IV_NEXT10]] = add nuw nsw i64 [[INDVARS_IV9]], 1 -; CHECK-NEXT: [[EXITCOND15:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT10]], [[WIDE_TRIP_COUNT14]] -; CHECK-NEXT: br i1 [[EXITCOND15]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK-NEXT: [[INDVARS_IV_NEXT8]] = add nuw nsw i64 [[INDVARS_IV7]], 1 +; CHECK-NEXT: [[EXITCOND12:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT8]], [[WIDE_TRIP_COUNT11]] +; CHECK-NEXT: br i1 [[EXITCOND12]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll --- a/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll @@ -77,10 +77,17 @@ ; CHECK: general_case24: ; CHECK-NEXT: br i1 false, label [[LOOP2_PREHEADER:%.*]], label [[LOOP2_EXIT]] ; CHECK: loop2.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = udiv i32 14, [[LOCAL_0_]] +; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 60392, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], 60392 ; CHECK-NEXT: br label [[LOOP2:%.*]] ; CHECK: loop2: +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[TMP5]], -1 ; CHECK-NEXT: [[I4:%.*]] = load atomic i64, i64* [[P1:%.*]] unordered, align 8 -; CHECK-NEXT: [[I6:%.*]] = sub i64 [[I4]], -1 +; CHECK-NEXT: [[I6:%.*]] = sub i64 [[I4]], [[INDVARS_IV_NEXT]] ; CHECK-NEXT: store atomic i64 [[I6]], i64* [[P1]] unordered, align 8 ; CHECK-NEXT: br i1 true, label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[LOOP2]] ; CHECK: loop2.exit.loopexit: diff --git a/llvm/test/Transforms/IndVarSimplify/bbi-63564.ll b/llvm/test/Transforms/IndVarSimplify/bbi-63564.ll --- a/llvm/test/Transforms/IndVarSimplify/bbi-63564.ll +++ b/llvm/test/Transforms/IndVarSimplify/bbi-63564.ll @@ -19,7 +19,7 @@ ; CHECK-NEXT: br label [[FOR_BODY2:%.*]] ; CHECK: for.body2: ; CHECK-NEXT: [[INC2:%.*]] = phi i16 [ undef, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY2]] ] -; CHECK-NEXT: [[INC]] = add nuw nsw i16 [[INC2]], 1 +; CHECK-NEXT: [[INC]] = add nsw i16 [[INC2]], 1 ; CHECK-NEXT: store i16 [[INC]], i16* undef, align 1 ; CHECK-NEXT: br i1 true, label [[FOR_BODY2]], label [[CRIT_EDGE:%.*]] ; CHECK: crit_edge: diff --git a/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll b/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll --- a/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll +++ 
b/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll @@ -85,7 +85,7 @@ ; CHECK-NEXT: [[UNSIGNED_CMP:%.*]] = icmp ult i32 [[IV]], [[LEN]] ; CHECK-NEXT: br i1 [[UNSIGNED_CMP]], label [[BACKEDGE]], label [[FAILED_UNSIGNED:%.*]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[DONE:%.*]] ; CHECK: failed.signed: @@ -161,7 +161,7 @@ ; CHECK-NEXT: [[UNSIGNED_CMP:%.*]] = icmp ult i32 [[IV]], [[LEN]] ; CHECK-NEXT: br i1 [[UNSIGNED_CMP]], label [[BACKEDGE]], label [[FAILED_UNSIGNED:%.*]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[DONE:%.*]] ; CHECK: failed.signed: @@ -252,7 +252,7 @@ ; CHECK: signed.passed: ; CHECK-NEXT: br i1 true, label [[BACKEDGE]], label [[FAILED_UNSIGNED:%.*]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[DONE:%.*]] ; CHECK: failed.signed: @@ -354,7 +354,7 @@ ; CHECK-NEXT: [[UNSIGNED_CMP:%.*]] = icmp ult i32 [[IV_START]], [[LEN]] ; CHECK-NEXT: br i1 [[UNSIGNED_CMP]], label [[BACKEDGE]], label [[FAILED_UNSIGNED:%.*]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[OUTER_LOOP_BACKEDGE]] ; CHECK: outer.loop.backedge: @@ -472,7 +472,7 @@ ; CHECK-NEXT: [[UNSIGNED_CMP:%.*]] = icmp ult i32 [[IV_START]], [[LEN]] ; CHECK-NEXT: br i1 [[UNSIGNED_CMP]], label [[BACKEDGE]], label [[FAILED_UNSIGNED:%.*]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[OUTER_LOOP_SELECTION:%.*]] ; CHECK: outer.loop.selection: diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll --- a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll +++ b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll @@ -638,7 +638,7 @@ ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP26]], 0 ; CHECK-NEXT: br i1 [[TMP29]], label [[BB1]], label [[BB2_LOOPEXIT]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP30]] = add nuw nsw i32 [[VAR_1]], 1 +; CHECK-NEXT: [[TMP30]] = add nuw i32 [[VAR_1]], 1 ; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[VAR_0]], 0 ; CHECK-NEXT: br i1 [[TMP31]], label [[BB3:%.*]], label [[BB0]] ; CHECK: bb2.loopexit: @@ -1003,7 +1003,7 @@ ; CHECK: checked.2: ; CHECK-NEXT: br i1 true, label [[BACKEDGE]], label [[FAIL]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 758394 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 758394 ; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @cond_func() ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: fail: @@ -1055,7 +1055,7 @@ ; CHECK: checked.2: ; CHECK-NEXT: br i1 [[C3]], label [[BACKEDGE]], label [[FAIL]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 758394 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 758394 ; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @cond_func() ; CHECK-NEXT: br i1 
[[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: fail: diff --git a/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll b/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll --- a/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll +++ b/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll @@ -1029,7 +1029,7 @@ ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]] ; CHECK: for.end.loopexit: diff --git a/llvm/test/Transforms/IndVarSimplify/loop-predication.ll b/llvm/test/Transforms/IndVarSimplify/loop-predication.ll --- a/llvm/test/Transforms/IndVarSimplify/loop-predication.ll +++ b/llvm/test/Transforms/IndVarSimplify/loop-predication.ll @@ -611,7 +611,7 @@ ; CHECK-NEXT: ret i32 -1 ; CHECK: guarded: ; CHECK-NEXT: store volatile i32 0, i32* [[A:%.*]], align 4 -; CHECK-NEXT: [[I_NEXT]] = add nuw i32 [[I]], 1 +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 ; CHECK-NEXT: br label [[LOOP]] ; loop.preheader: diff --git a/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll b/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll --- a/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll +++ b/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll @@ -17,7 +17,7 @@ ; CHECK-NEXT: [[CHECK_1:%.*]] = icmp slt i32 [[IV_1]], [[X:%.*]] ; CHECK-NEXT: br i1 [[CHECK_1]], label [[GUARDED_1]], label [[FAIL_LOOPEXIT:%.*]] ; CHECK: guarded.1: -; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i32 [[IV_1]], 1 +; CHECK-NEXT: [[IV_NEXT_1]] = add nuw i32 [[IV_1]], 1 ; CHECK-NEXT: [[LOOP_COND_1:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[LOOP_COND_1]], label [[LOOP_1]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: loop.2: @@ -86,7 +86,7 @@ ; CHECK-NEXT: [[CHECK_2:%.*]] = icmp slt i32 [[IV_2]], [[X:%.*]] ; CHECK-NEXT: br i1 [[CHECK_2]], label [[GUARDED_2]], label [[FAIL_LOOPEXIT1:%.*]] ; CHECK: guarded.2: -; CHECK-NEXT: [[IV_NEXT_2]] = add nuw nsw i32 [[IV_2]], 1 +; CHECK-NEXT: [[IV_NEXT_2]] = add nuw i32 [[IV_2]], 1 ; CHECK-NEXT: [[LOOP_COND_2:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[LOOP_COND_2]], label [[LOOP_2]], label [[EXIT_LOOPEXIT2:%.*]] ; CHECK: exit.loopexit: diff --git a/llvm/test/Transforms/InstCombine/known-phi-br.ll b/llvm/test/Transforms/InstCombine/known-phi-br.ll --- a/llvm/test/Transforms/InstCombine/known-phi-br.ll +++ b/llvm/test/Transforms/InstCombine/known-phi-br.ll @@ -6,11 +6,14 @@ ; the known bits of a phi edge based off a conditional branch feeding the phi. 
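For orientation, a minimal sketch of the simplification these tests are after (the function @known_bits_sketch is hypothetical and not part of the patch; the tests below mark the actual fold as a TODO): the branch condition pins down the known bits of the phi operand flowing in from the guarded edge, so a mask applied after the phi is redundant.

    define i64 @known_bits_sketch(i64 %x) {
    entry:
      %cmp = icmp ult i64 %x, 8          ; on the true edge, %x fits in 3 bits
      br i1 %cmp, label %end, label %body
    body:
      %mask = and i64 %x, 7              ; on the false edge, %x is masked explicitly
      br label %end
    end:
      %phi = phi i64 [ %x, %entry ], [ %mask, %body ]
      %res = and i64 %phi, 7             ; both incoming values already satisfy the
      ret i64 %res                       ; mask, so %res could fold to %phi itself
    }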
; +declare void @use(i1) + ; TODO: %x either eq 7 or is set to 7 define i64 @limit_i64_eq_7(i64 %x) { ; CHECK-LABEL: @limit_i64_eq_7( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[X:%.*]], 7 +; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY:%.*]] ; CHECK: body: ; CHECK-NEXT: br label [[END]] @@ -20,6 +23,7 @@ ; entry: %cmp = icmp eq i64 %x, 7 + call void @use(i1 %cmp) br i1 %cmp, label %end, label %body body: br label %end @@ -32,8 +36,9 @@ define i64 @limit_i64_ne_255(i64 %x) { ; CHECK-LABEL: @limit_i64_ne_255( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[X:%.*]], 255 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[END:%.*]], label [[BODY:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[X:%.*]], 255 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: br i1 [[CMP]], label [[BODY:%.*]], label [[END:%.*]] ; CHECK: body: ; CHECK-NEXT: br label [[END]] ; CHECK: end: @@ -42,6 +47,7 @@ ; entry: %cmp = icmp ne i64 %x, 255 + call void @use(i1 %cmp) br i1 %cmp, label %body, label %end body: br label %end @@ -55,6 +61,7 @@ ; CHECK-LABEL: @limit_i64_ule_15( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X:%.*]], 16 +; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY:%.*]] ; CHECK: body: ; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 15 @@ -66,6 +73,7 @@ ; entry: %cmp = icmp ule i64 %x, 15 + call void @use(i1 %cmp) br i1 %cmp, label %end, label %body body: %mask = and i64 %x, 15 @@ -81,6 +89,7 @@ ; CHECK-LABEL: @limit_i64_uge_8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[X:%.*]], 7 +; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: br i1 [[CMP]], label [[BODY:%.*]], label [[END:%.*]] ; CHECK: body: ; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 7 @@ -92,6 +101,7 @@ ; entry: %cmp = icmp uge i64 %x, 8 + call void @use(i1 %cmp) br i1 %cmp, label %body, label %end body: %mask = and i64 %x, 7 @@ -107,6 +117,7 @@ ; CHECK-LABEL: @limit_i64_ult_8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X:%.*]], 8 +; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY:%.*]] ; CHECK: body: ; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 7 @@ -118,6 +129,7 @@ ; entry: %cmp = icmp ult i64 %x, 8 + call void @use(i1 %cmp) br i1 %cmp, label %end, label %body body: %mask = and i64 %x, 7 @@ -133,6 +145,7 @@ ; CHECK-LABEL: @limit_i64_ugt_7( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[X:%.*]], 7 +; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: br i1 [[CMP]], label [[BODY:%.*]], label [[END:%.*]] ; CHECK: body: ; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 7 @@ -144,6 +157,7 @@ ; entry: %cmp = icmp ugt i64 %x, 7 + call void @use(i1 %cmp) br i1 %cmp, label %body, label %end body: %mask = and i64 %x, 7 @@ -154,4 +168,62 @@ ret i64 %res } +; +; negative tests +; +; %x either ule 15 or is masked with 15 +define i64 @limit_i64_ule_15_mask3(i64 %x) { +; CHECK-LABEL: @limit_i64_ule_15_mask3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X:%.*]], 16 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY:%.*]] +; CHECK: body: +; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 15 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[X_MASK:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ [[MASK]], [[BODY]] ] +; CHECK-NEXT: [[RES:%.*]] = and i64 [[X_MASK]], 3 +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %cmp = 
icmp ule i64 %x, 15 + call void @use(i1 %cmp) + br i1 %cmp, label %end, label %body +body: + %mask = and i64 %x, 15 + br label %end +end: + %x.mask = phi i64 [ %x, %entry ], [ %mask, %body ] + %res = and i64 %x.mask, 3 + ret i64 %res +} + +; %x either ult 8 or is masked with 7 +define i64 @limit_i64_ult_8_mask1(i64 %x) { +; CHECK-LABEL: @limit_i64_ult_8_mask1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X:%.*]], 8 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY:%.*]] +; CHECK: body: +; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 7 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[X_MASK:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ [[MASK]], [[BODY]] ] +; CHECK-NEXT: [[RES:%.*]] = and i64 [[X_MASK]], 1 +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %cmp = icmp ult i64 %x, 8 + call void @use(i1 %cmp) + br i1 %cmp, label %end, label %body +body: + %mask = and i64 %x, 7 + br label %end +end: + %x.mask = phi i64 [ %x, %entry ], [ %mask, %body ] + %res = and i64 %x.mask, 1 + ret i64 %res +} diff --git a/llvm/test/Transforms/InstCombine/snprintf-2.ll b/llvm/test/Transforms/InstCombine/snprintf-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/snprintf-2.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; +; Verify that snprintf calls with a constant size not exceeding INT_MAX +; and constant format string with no formatting directives are transformed +; into memcpy. Also verify that a size in excess of INT_MAX prevents +; the transformation. +; +; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s -check-prefixes=ANY,BE +; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s -check-prefixes=ANY,LE + +@s = constant [4 x i8] c"123\00" + +@adst = external global [0 x i8*] +@asiz = external global [0 x i32] + +declare i32 @snprintf(i8*, i64, i8*, ...) + + +; Verify that all snprintf calls with a bound between INT_MAX and down +; to 0 are transformed to memcpy. 
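To make the CHECK lines below easier to read, here is a minimal sketch of one representative fold, reusing the @s and @snprintf definitions above; the function @fold_sketch and its argument are hypothetical, and the store constants come from the BE/LE CHECK lines themselves:

    define i32 @fold_sketch(i8* %dst) {
      ; a bound of 5 exceeds strlen("123") + 1, so nothing is truncated
      %fmt = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0
      %n = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %dst, i64 5, i8* %fmt)
      ; instcombine rewrites the call to the equivalent of
      ;   %p32 = bitcast i8* %dst to i32*
      ;   store i32 825373440, i32* %p32, align 1   ; "123\00" big-endian
      ;                                             ; (3355185 little-endian)
      ; and %n folds to the constant 3; smaller bounds store bound-1
      ; characters plus the terminating nul, and the result is still 3
      ret i32 %n
    }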
+ +define void @fold_snprintf_fmt() { +; BE-LABEL: @fold_snprintf_fmt( +; BE-NEXT: [[PDIMAX1:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2147483647) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PDIMAX1]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; BE-NEXT: [[PD52:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 5) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PD52]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 5), align 4 +; BE-NEXT: [[PD43:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 4) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PD43]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; BE-NEXT: [[PD3:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 3), align 8 +; BE-NEXT: [[TMP1:%.*]] = bitcast i8* [[PD3]] to i16* +; BE-NEXT: store i16 12594, i16* [[TMP1]], align 1 +; BE-NEXT: [[ENDPTR:%.*]] = getelementptr inbounds i8, i8* [[PD3]], i64 2 +; BE-NEXT: store i8 0, i8* [[ENDPTR]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 3), align 4 +; BE-NEXT: [[PD2:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; BE-NEXT: store i8 49, i8* [[PD2]], align 1 +; BE-NEXT: [[ENDPTR4:%.*]] = getelementptr inbounds i8, i8* [[PD2]], i64 1 +; BE-NEXT: store i8 0, i8* [[ENDPTR4]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; BE-NEXT: [[PD1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; BE-NEXT: store i8 0, i8* [[PD1]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; BE-NEXT: ret void +; +; LE-LABEL: @fold_snprintf_fmt( +; LE-NEXT: [[PDIMAX1:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2147483647) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PDIMAX1]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; LE-NEXT: [[PD52:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 5) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PD52]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 5), align 4 +; LE-NEXT: [[PD43:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 4) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PD43]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; LE-NEXT: [[PD3:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 3), align 8 +; LE-NEXT: [[TMP1:%.*]] = bitcast i8* [[PD3]] to i16* +; LE-NEXT: store i16 12849, i16* [[TMP1]], align 1 +; LE-NEXT: [[ENDPTR:%.*]] = getelementptr inbounds i8, i8* [[PD3]], i64 2 +; LE-NEXT: store i8 0, i8* [[ENDPTR]], align 
1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 3), align 4 +; LE-NEXT: [[PD2:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; LE-NEXT: store i8 49, i8* [[PD2]], align 1 +; LE-NEXT: [[ENDPTR4:%.*]] = getelementptr inbounds i8, i8* [[PD2]], i64 1 +; LE-NEXT: store i8 0, i8* [[ENDPTR4]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; LE-NEXT: [[PD1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; LE-NEXT: store i8 0, i8* [[PD1]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; LE-NEXT: ret void +; + %fmt = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0 + + %pdimax = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2147483647) + %nimax = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimax, i64 2147483647, i8* %fmt) + store i32 %nimax, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + %pd5 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 5) + %n5 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd5, i64 5, i8* %fmt) + store i32 %n5, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 5) + + %pd4 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 4) + %n4 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd4, i64 4, i8* %fmt) + store i32 %n4, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 4) + + %pd3 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 3) + %n3 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd3, i64 3, i8* %fmt) + store i32 %n3, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 3) + + %pd2 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2) + %n2 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd2, i64 2, i8* %fmt) + store i32 %n2, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 2) + + %pd1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %n1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd1, i64 1, i8* %fmt) + store i32 %n1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + %pd0 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %n0 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd0, i64 0, i8* %fmt) + store i32 %n0, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + ret void +} + + +; Verify that snprintf calls with a bound greater than INT_MAX are not +; transformed. POSIX requires implementations to set errno to EOVERFLOW +; so such calls could be folded to just that followed by returning -1. + +define void @call_snprintf_fmt_ximax() { +; ANY-LABEL: @call_snprintf_fmt_ximax( +; ANY-NEXT: [[PDM1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; ANY-NEXT: [[NM1:%.*]] = call i32 (i8*, i64, i8*, ...) 
@snprintf(i8* noundef nonnull dereferenceable(1) [[PDM1]], i64 -1, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @s, i64 0, i64 0)) +; ANY-NEXT: store i32 [[NM1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; ANY-NEXT: [[PDIMAXP1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 0), align 8 +; ANY-NEXT: [[NIMAXP1:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[PDIMAXP1]], i64 2147483648, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @s, i64 0, i64 0)) +; ANY-NEXT: store i32 [[NIMAXP1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; ANY-NEXT: ret void +; + %fmt = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0 + + %pdm1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %nm1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdm1, i64 -1, i8* %fmt) + store i32 %nm1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + %pdimaxp1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %nimaxp1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimaxp1, i64 2147483648, i8* %fmt) + store i32 %nimaxp1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + ret void +} diff --git a/llvm/test/Transforms/InstCombine/snprintf-3.ll b/llvm/test/Transforms/InstCombine/snprintf-3.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/snprintf-3.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; +; Verify that snprintf calls with a constant size not exceeding INT_MAX +; and a "%s" format string and a const string argument are transformed +; into memcpy. Also verify that a size in excess of INT_MAX prevents +; the transformation. +; +; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s -check-prefixes=ANY,BE +; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s -check-prefixes=ANY,LE + +@pcnt_s = constant [3 x i8] c"%s\00" +@s = constant [4 x i8] c"123\00" + +@adst = external global [0 x i8*] +@asiz = external global [0 x i32] + +declare i32 @snprintf(i8*, i64, i8*, ...) + + +; Verify that all snprintf calls with a bound between INT_MAX and down +; to 0 are transformed to memcpy. 
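The point worth keeping in mind for the truncating cases below is that the result stays the full length. A minimal sketch of the bound-2 fold, reusing @pcnt_s, @s, and @snprintf from this file (the function @pcnt_s_sketch is hypothetical):

    define i32 @pcnt_s_sketch(i8* %dst) {
      ; "%s" with the constant argument "123", but room for just one character
      %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_s, i32 0, i32 0
      %ps = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0
      %n = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %dst, i64 2, i8* %fmt, i8* %ps)
      ; instcombine rewrites the call to the equivalent of
      ;   store i8 49, i8* %dst, align 1            ; '1'
      ;   %endptr = getelementptr inbounds i8, i8* %dst, i64 1
      ;   store i8 0, i8* %endptr, align 1
      ; and %n folds to 3: snprintf returns the length the untruncated
      ; output would have had, not the number of characters written
      ret i32 %n
    }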
+ +define void @fold_snprintf_pcnt_s() { +; BE-LABEL: @fold_snprintf_pcnt_s( +; BE-NEXT: [[PDIMAX1:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2147483647) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PDIMAX1]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; BE-NEXT: [[PD52:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 5) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PD52]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 5), align 4 +; BE-NEXT: [[PD43:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 4) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PD43]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; BE-NEXT: [[PD3:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 3), align 8 +; BE-NEXT: [[TMP1:%.*]] = bitcast i8* [[PD3]] to i16* +; BE-NEXT: store i16 12594, i16* [[TMP1]], align 1 +; BE-NEXT: [[ENDPTR:%.*]] = getelementptr inbounds i8, i8* [[PD3]], i64 2 +; BE-NEXT: store i8 0, i8* [[ENDPTR]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 3), align 4 +; BE-NEXT: [[PD2:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; BE-NEXT: store i8 49, i8* [[PD2]], align 1 +; BE-NEXT: [[ENDPTR4:%.*]] = getelementptr inbounds i8, i8* [[PD2]], i64 1 +; BE-NEXT: store i8 0, i8* [[ENDPTR4]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; BE-NEXT: [[PD1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; BE-NEXT: store i8 0, i8* [[PD1]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; BE-NEXT: ret void +; +; LE-LABEL: @fold_snprintf_pcnt_s( +; LE-NEXT: [[PDIMAX1:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2147483647) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PDIMAX1]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; LE-NEXT: [[PD52:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 5) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PD52]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 5), align 4 +; LE-NEXT: [[PD43:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 4) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PD43]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; LE-NEXT: [[PD3:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 3), align 8 +; LE-NEXT: [[TMP1:%.*]] = bitcast i8* [[PD3]] to i16* +; LE-NEXT: store i16 12849, i16* [[TMP1]], align 1 +; LE-NEXT: [[ENDPTR:%.*]] = getelementptr inbounds i8, i8* [[PD3]], i64 2 +; LE-NEXT: store i8 0, i8* 
[[ENDPTR]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 3), align 4 +; LE-NEXT: [[PD2:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; LE-NEXT: store i8 49, i8* [[PD2]], align 1 +; LE-NEXT: [[ENDPTR4:%.*]] = getelementptr inbounds i8, i8* [[PD2]], i64 1 +; LE-NEXT: store i8 0, i8* [[ENDPTR4]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; LE-NEXT: [[PD1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; LE-NEXT: store i8 0, i8* [[PD1]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; LE-NEXT: ret void +; + %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_s, i32 0, i32 0 + %ps = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0 + + %pdimax = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2147483647) + %nimax = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimax, i64 2147483647, i8* %fmt, i8* %ps) + store i32 %nimax, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + %pd5 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 5) + %n5 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd5, i64 5, i8* %fmt, i8* %ps) + store i32 %n5, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 5) + + %pd4 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 4) + %n4 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd4, i64 4, i8* %fmt, i8* %ps) + store i32 %n4, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 4) + + %pd3 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 3) + %n3 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd3, i64 3, i8* %fmt, i8* %ps) + store i32 %n3, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 3) + + %pd2 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2) + %n2 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd2, i64 2, i8* %fmt, i8* %ps) + store i32 %n2, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 2) + + %pd1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %n1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd1, i64 1, i8* %fmt, i8* %ps) + store i32 %n1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + %pd0 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %n0 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd0, i64 0, i8* %fmt, i8* %ps) + store i32 %n0, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + ret void +} + + +; Verify that snprintf calls with a bound greater than INT_MAX are not +; transformed. POSIX requires implementations to set errno to EOVERFLOW +; so such calls could be folded to just that followed by returning -1. + +define void @call_snprintf_pcnt_s_ximax() { +; ANY-LABEL: @call_snprintf_pcnt_s_ximax( +; ANY-NEXT: [[PDM1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; ANY-NEXT: [[NM1:%.*]] = call i32 (i8*, i64, i8*, ...) 
@snprintf(i8* noundef nonnull dereferenceable(1) [[PDM1]], i64 -1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @pcnt_s, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @s, i64 0, i64 0)) +; ANY-NEXT: store i32 [[NM1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; ANY-NEXT: [[PDIMAXP1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 0), align 8 +; ANY-NEXT: [[NIMAXP1:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[PDIMAXP1]], i64 2147483648, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @pcnt_s, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @s, i64 0, i64 0)) +; ANY-NEXT: store i32 [[NIMAXP1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; ANY-NEXT: ret void +; + %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_s, i32 0, i32 0 + %ps = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0 + + %pdm1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %nm1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdm1, i64 -1, i8* %fmt, i8* %ps) + store i32 %nm1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + %pdimaxp1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %nimaxp1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimaxp1, i64 2147483648, i8* %fmt, i8* %ps) + store i32 %nimaxp1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + ret void +} diff --git a/llvm/test/Transforms/InstCombine/snprintf-4.ll b/llvm/test/Transforms/InstCombine/snprintf-4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/snprintf-4.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; +; Verify that snprintf calls with a constant size not exceeding INT_MAX +; and a "%c" format string are transformed into a store of the character. +; Also verify that a size in excess of INT_MAX prevents the transformation. +; +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +@pcnt_c = constant [3 x i8] c"%c\00" + +@adst = external global [0 x i8*] +@asiz = external global [0 x i32] + +declare i32 @snprintf(i8*, i64, i8*, ...) + + +; Verify that all snprintf calls with a bound between INT_MAX and down +; to 0 are transformed to stores of the character.
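Unlike the two files above, the interesting folds here involve a nonconstant argument. A minimal sketch of the bound-2 case, reusing @pcnt_c and @snprintf from this file (the function @pcnt_c_sketch is hypothetical):

    define i32 @pcnt_c_sketch(i8* %dst, i32 %c) {
      %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_c, i32 0, i32 0
      %n = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %dst, i64 2, i8* %fmt, i32 %c)
      ; instcombine rewrites the call to the equivalent of
      ;   %char = trunc i32 %c to i8
      ;   store i8 %char, i8* %dst, align 1
      ;   %nul = getelementptr inbounds i8, i8* %dst, i64 1
      ;   store i8 0, i8* %nul, align 1
      ; and %n folds to the constant 1; a bound of 1 stores only the nul
      ; and a bound of 0 stores nothing, yet the result is 1 either way
      ret i32 %n
    }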
+ +define void @fold_snprintf_pcnt_c(i32 %c) { +; CHECK-LABEL: @fold_snprintf_pcnt_c( +; CHECK-NEXT: [[PDIMAX:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 0), align 8 +; CHECK-NEXT: store i8 1, i8* [[PDIMAX]], align 1 +; CHECK-NEXT: [[NUL:%.*]] = getelementptr inbounds i8, i8* [[PDIMAX]], i64 1 +; CHECK-NEXT: store i8 0, i8* [[NUL]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; CHECK-NEXT: [[PD2:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; CHECK-NEXT: store i8 2, i8* [[PD2]], align 1 +; CHECK-NEXT: [[NUL1:%.*]] = getelementptr inbounds i8, i8* [[PD2]], i64 1 +; CHECK-NEXT: store i8 0, i8* [[NUL1]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; CHECK-NEXT: [[PD2_0:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; CHECK-NEXT: store i8 0, i8* [[PD2_0]], align 1 +; CHECK-NEXT: [[NUL2:%.*]] = getelementptr inbounds i8, i8* [[PD2_0]], i64 1 +; CHECK-NEXT: store i8 0, i8* [[NUL2]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; CHECK-NEXT: [[PD1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 3), align 8 +; CHECK-NEXT: store i8 0, i8* [[PD1]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 3), align 4 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; CHECK-NEXT: [[PD2_C:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 4), align 8 +; CHECK-NEXT: [[CHAR:%.*]] = trunc i32 [[C:%.*]] to i8 +; CHECK-NEXT: store i8 [[CHAR]], i8* [[PD2_C]], align 1 +; CHECK-NEXT: [[NUL3:%.*]] = getelementptr inbounds i8, i8* [[PD2_C]], i64 1 +; CHECK-NEXT: store i8 0, i8* [[NUL3]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; CHECK-NEXT: [[PD1_C:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 5), align 8 +; CHECK-NEXT: store i8 0, i8* [[PD1_C]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 5), align 4 +; CHECK-NEXT: ret void +; + %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_c, i32 0, i32 0 + + ; Transform snprintf(dst, INT_MAX, "%c", 1) to memcpy(dst, "1", 2), 1. + %pdimax = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %nimax = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimax, i64 2147483647, i8* %fmt, i32 1) + store i32 %nimax, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + ; Transform snprintf(dst, 2, "%c", '\2') to memcpy(dst, "2", 2), 1. + %pd2 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %n2 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd2, i64 2, i8* %fmt, i8 2) + store i32 %n2, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + ; Transform snprintf(dst, 2, "%c", '\0') to memcpy(dst, "\0", 2), 1. + %pd2_0 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2) + %n2_0 = call i32 (i8*, i64, i8*, ...) 
@snprintf(i8* %pd2_0, i64 2, i8* %fmt, i8 0) + store i32 %n2_0, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 2) + + ; Transform snprintf(dst, 1, "%c", (short)3) to memcpy(dst, "\3", 2), 1. + %pd1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 3) + %n1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd1, i64 1, i8* %fmt, i16 3) + store i32 %n1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 3) + + ; Fold snprintf(dst, 0, "%c", 4) to 1. + %pd0 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 4) + %n0 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd0, i64 0, i8* %fmt, i32 4) + store i32 %n0, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 4) + + + ; Transform snprintf(dst, 2, "%c", c) with a nonconstant c to + ; dst[0] = c, dst[1] = '\0', 1. + %pd2_c = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 4) + %n2_c = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd2_c, i64 2, i8* %fmt, i32 %c) + store i32 %n2_c, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 4) + + ; Transform snprintf(dst, 1, "%c", c) with a nonconstant c to *dst = '\0', 0. + %pd1_c = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 5) + %n1_c = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd1_c, i64 1, i8* %fmt, i32 %c) + store i32 %n1_c, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 5) + + ret void +} + + +; Verify that snprintf calls with a bound greater than INT_MAX are not +; transformed. POSIX requires implementations to set errno to EOVERFLOW +; so such calls could be folded to just that followed by returning -1. + +define void @call_snprintf_pcnt_c_ximax(i32 %c) { +; CHECK-LABEL: @call_snprintf_pcnt_c_ximax( +; CHECK-NEXT: [[PDM1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 0), align 8 +; CHECK-NEXT: [[NM1:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[PDM1]], i64 -1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @pcnt_c, i64 0, i64 0), i8 0) +; CHECK-NEXT: store i32 [[NM1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; CHECK-NEXT: [[PDIMAXP1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; CHECK-NEXT: [[NIMAXP1:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[PDIMAXP1]], i64 2147483648, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @pcnt_c, i64 0, i64 0), i8 1) +; CHECK-NEXT: store i32 [[NIMAXP1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; CHECK-NEXT: [[PDM1SL32:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; CHECK-NEXT: [[NM1SL32:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[PDM1SL32]], i64 -4294967296, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @pcnt_c, i64 0, i64 0), i8 1) +; CHECK-NEXT: store i32 [[NM1SL32]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; CHECK-NEXT: ret void +; + %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_c, i32 0, i32 0 + + %pdm1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %nm1 = call i32 (i8*, i64, i8*, ...) 
@snprintf(i8* %pdm1, i64 -1, i8* %fmt, i8 0) + store i32 %nm1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + + %pdimaxp1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %nimaxp1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimaxp1, i64 2147483648, i8* %fmt, i8 1) + store i32 %nimaxp1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + ; Exercise snprintf(dst, -1LU << 32, "%c", c). + %pdm1sl32 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2) + %nm1sl32 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdm1sl32, i64 18446744069414584320, i8* %fmt, i8 1) + store i32 %nm1sl32, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 2) + + ret void +} diff --git a/llvm/test/Transforms/InstCombine/snprintf.ll b/llvm/test/Transforms/InstCombine/snprintf.ll --- a/llvm/test/Transforms/InstCombine/snprintf.ll +++ b/llvm/test/Transforms/InstCombine/snprintf.ll @@ -92,10 +92,10 @@ ret i32 %call } -define i32 @test_char_wrong_size(i8* %buf) #0 { -; CHECK-LABEL: @test_char_wrong_size( -; CHECK-NEXT: [[CALL:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[BUF:%.*]], i64 1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.2, i64 0, i64 0), i32 65) -; CHECK-NEXT: ret i32 [[CALL]] +define i32 @test_char_small_size(i8* %buf) #0 { +; CHECK-LABEL: @test_char_small_size( +; CHECK-NEXT: store i8 0, i8* [[BUF:%.*]], align 1 +; CHECK-NEXT: ret i32 1 ; %call = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %buf, i64 1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.2, i64 0, i64 0), i32 65) #2 ret i32 %call @@ -120,10 +120,10 @@ ret i32 %call } -define i32 @test_str_wrong_size(i8* %buf) #0 { -; CHECK-LABEL: @test_str_wrong_size( -; CHECK-NEXT: [[CALL:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[BUF:%.*]], i64 1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.3, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0)) -; CHECK-NEXT: ret i32 [[CALL]] +define i32 @test_str_small_size(i8* %buf) #0 { +; CHECK-LABEL: @test_str_small_size( +; CHECK-NEXT: store i8 0, i8* [[BUF:%.*]], align 1 +; CHECK-NEXT: ret i32 3 ; %call = call i32 (i8*, i64, i8*, ...) 
@snprintf(i8* %buf, i64 1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.3, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0)) #2 ret i32 %call diff --git a/llvm/test/Transforms/InstCombine/ssubo.ll b/llvm/test/Transforms/InstCombine/ssubo.ll --- a/llvm/test/Transforms/InstCombine/ssubo.ll +++ b/llvm/test/Transforms/InstCombine/ssubo.ll @@ -4,6 +4,8 @@ declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) declare { i8, i1 } @llvm.ssub.with.overflow.i8(i8, i8) +declare void @use(i1) + define i1 @test_generic(i64 %a, i64 %b) { ; CHECK-LABEL: @test_generic( ; CHECK-NEXT: [[RES:%.*]] = tail call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A:%.*]], i64 [[B:%.*]]) @@ -95,3 +97,72 @@ ret i1 %overflow } +define i1 @sub_eq0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_eq0( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i8 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[EQ0]] +; + %ss = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %eq0 = icmp eq i8 %sub, 0 + ret i1 %eq0 +} + +define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_ne0( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[NE0:%.*]] = icmp ne i8 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[NE0]] +; + %ss = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %ne0 = icmp ne i8 %sub, 0 + ret i1 %ne0 +} + +; negative test - need zero + +define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_eq1( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0 +; CHECK-NEXT: [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1 +; CHECK-NEXT: ret i1 [[EQ1]] +; + %ss = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %eq1 = icmp eq i8 %sub, 1 + ret i1 %eq1 +} + +; negative test - need equality pred + +define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_sgt0( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0 +; CHECK-NEXT: [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0 +; CHECK-NEXT: ret i1 [[SGT0]] +; + %ss = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %sgt0 = icmp sgt i8 %sub, 0 + ret i1 %sgt0 +} diff --git a/llvm/test/Transforms/InstCombine/usubo.ll b/llvm/test/Transforms/InstCombine/usubo.ll --- a/llvm/test/Transforms/InstCombine/usubo.ll +++ b/llvm/test/Transforms/InstCombine/usubo.ll @@ -4,6 +4,8 @@ declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) declare { i8, i1 } 
@llvm.usub.with.overflow.i8(i8, i8) +declare void @use(i1) + define i1 @test_generic(i64 %a, i64 %b) { ; CHECK-LABEL: @test_generic( ; CHECK-NEXT: [[OVERFLOW:%.*]] = icmp ult i64 [[A:%.*]], [[B:%.*]] @@ -94,3 +96,70 @@ ret i1 %overflow } +define i1 @sub_eq0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_eq0( +; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i8 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[EQ0]] +; + %us = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %us, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %us, 0 + %eq0 = icmp eq i8 %sub, 0 + ret i1 %eq0 +} + +define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_ne0( +; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[NE0:%.*]] = icmp ne i8 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[NE0]] +; + %us = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %us, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %us, 0 + %ne0 = icmp ne i8 %sub, 0 + ret i1 %ne0 +} + +; negative test - need zero + +define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_eq1( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0 +; CHECK-NEXT: [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1 +; CHECK-NEXT: ret i1 [[EQ1]] +; + %ss = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %eq1 = icmp eq i8 %sub, 1 + ret i1 %eq1 +} + +; negative test - need equality pred + +define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_sgt0( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0 +; CHECK-NEXT: [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0 +; CHECK-NEXT: ret i1 [[SGT0]] +; + %ss = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %sgt0 = icmp sgt i8 %sub, 0 + ret i1 %sgt0 +} diff --git a/llvm/test/Transforms/InstSimplify/implies.ll b/llvm/test/Transforms/InstSimplify/implies.ll --- a/llvm/test/Transforms/InstSimplify/implies.ll +++ b/llvm/test/Transforms/InstSimplify/implies.ll @@ -255,3 +255,15 @@ %res = icmp sge i1 %var30, %var29 ret i1 %res } + +; X <=(s) Y == Y ==> X (i1 1 becomes -1 for reasoning) +define i1 @test_sle(i32 %length.i, i32 %i) { +; CHECK-LABEL: @test_sle( +; CHECK-NEXT: ret i1 true +; + %iplus1 = add nsw nuw i32 %i, 1 + %var29 = icmp ult i32 %i, %length.i + %var30 = icmp ult i32 %iplus1, %length.i + %res = icmp sle i1 %var29, %var30 + ret i1 %res +} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll --- a/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll @@ -3147,7 +3147,7 @@ ; EPILOG-NEXT: %cmp1.7 = icmp ult i32 %inc.7, %N ; EPILOG-NEXT: br i1 %cmp1.7, label %latch.7, label 
%latchExit.epilog-lcssa.loopexit ; EPILOG: latch.7: -; EPILOG-NEXT: %niter.next.7 = add nuw i32 %niter.next.6, 1 +; EPILOG-NEXT: %niter.next.7 = add i32 %niter.next.6, 1 ; EPILOG-NEXT: %niter.ncmp.7 = icmp ne i32 %niter.next.7, %unroll_iter ; EPILOG-NEXT: br i1 %niter.ncmp.7, label %header, label %latchExit.unr-lcssa.loopexit ; EPILOG: latchExit.unr-lcssa.loopexit: @@ -3209,7 +3209,7 @@ ; EPILOG-BLOCK-NEXT: %cmp1.1 = icmp ult i32 %inc.1, %N ; EPILOG-BLOCK-NEXT: br i1 %cmp1.1, label %latch.1, label %latchExit.epilog-lcssa.loopexit ; EPILOG-BLOCK: latch.1: -; EPILOG-BLOCK-NEXT: %niter.next.1 = add nuw i32 %niter.next, 1 +; EPILOG-BLOCK-NEXT: %niter.next.1 = add i32 %niter.next, 1 ; EPILOG-BLOCK-NEXT: %niter.ncmp.1 = icmp ne i32 %niter.next.1, %unroll_iter ; EPILOG-BLOCK-NEXT: br i1 %niter.ncmp.1, label %header, label %latchExit.unr-lcssa.loopexit, !llvm.loop !8 ; EPILOG-BLOCK: latchExit.unr-lcssa.loopexit: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll @@ -0,0 +1,248 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=-1 -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" +target triple = "riscv64" + +; Dependence distance between read and write is greater than the trip +; count of the loop. Thus, values written are never read for any +; valid vectorization of the loop. +define void @test(ptr %p) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 200 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, 200 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 +; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 200 +; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] +; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; 
CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [0, %entry], [%iv.next, %loop] + %a1 = getelementptr i64, ptr %p, i64 %iv + %v = load i64, ptr %a1, align 32 + %offset = add i64 %iv, 200 + %a2 = getelementptr i64, ptr %p, i64 %offset + store i64 %v, ptr %a2, align 32 + %iv.next = add i64 %iv, 1 + %cmp = icmp ne i64 %iv, 199 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; Dependence distance is less than the trip count, thus we must prove that the +; chosen VF is guaranteed to be less than the dependence distance. +define void @test_may_clobber(ptr %p) { +; CHECK-LABEL: @test_may_clobber( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 100 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, 200 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 +; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 100 +; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] +; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [0, %entry], [%iv.next, %loop] + %a1 = getelementptr i64, ptr %p, i64 %iv + %v = load i64, ptr %a1, align 32 + %offset = add i64 %iv, 100 + %a2 = getelementptr i64, ptr %p, i64 %offset + store i64 %v, ptr %a2, align 32 + %iv.next = add i64 %iv, 1 + %cmp = icmp ne i64 %iv, 199 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; Trivially no overlap due to maximum possible value of VLEN and LMUL +define void @trivial_due_max_vscale(ptr %p) { +; CHECK-LABEL: @trivial_due_max_vscale( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
[[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 32 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 8192 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <vscale x 1 x i64> [[WIDE_LOAD]], ptr [[TMP7]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 +; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 8192 +; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] +; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [0, %entry], [%iv.next, %loop] + %a1 = getelementptr i64, ptr %p, i64 %iv + %v = load i64, ptr %a1, align 32 + %offset = add i64 %iv, 8192 + %a2 = getelementptr i64, ptr %p, i64 %offset + store i64 %v, ptr %a2, align 32 + %iv.next = add i64 %iv, 1 + %cmp = icmp ne i64 %iv, 199 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; Dependence distance could be violated via LMUL>=2 or interleaving +define void @no_high_lmul_or_interleave(ptr %p) { +; CHECK-LABEL: @no_high_lmul_or_interleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0 +; 
CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 32 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 1024 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <vscale x 1 x i64> [[WIDE_LOAD]], ptr [[TMP7]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 +; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 1024 +; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] +; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [0, %entry], [%iv.next, %loop] + %a1 = getelementptr i64, ptr %p, i64 %iv + %v = load i64, ptr %a1, align 32 + %offset = add i64 %iv, 1024 + %a2 = getelementptr i64, ptr %p, i64 %offset + store i64 %v, ptr %a2, align 32 + %iv.next = add i64 %iv, 1 + %cmp = icmp ne i64 %iv, 199 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -slp-vectorizer -mtriple x86_64-unknown-linux-gnu < %s | FileCheck %s + +define <4 x double> @test(double* %p2, double %i1754, double %i1781, double %i1778) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I1771:%.*]] = getelementptr inbounds double, double* [[P2:%.*]], i64 54 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I1754:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1778:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1754]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[I1771]] to <2 x double>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1781]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = 
shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[I1792]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> , double [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x double> [[TMP12]], [[TMP13]] +; CHECK-NEXT: ret <4 x double> [[TMP14]] +; +entry: + %i1771 = getelementptr inbounds double, double* %p2, i64 54 + %i1772 = load double, double* %i1771, align 8 + %i1773 = fmul fast double %i1772, %i1754 + %i1782 = fmul fast double %i1754, %i1754 + %i1783 = fadd fast double %i1782, 1.000000e+00 + %i1787 = fmul fast double %i1778, %i1754 + %i1788 = fadd fast double %i1787, 1.000000e+00 + %i1792 = fmul fast double %i1754, %i1781 + %i1793 = fadd fast double %i1792, 1.000000e+00 + %i1795 = getelementptr inbounds double, double* %p2, i64 55 + %i1796 = load double, double* %i1795, align 8 + %i1797 = fmul fast double %i1796, %i1781 + %i1798 = fadd fast double %i1773, %i1797 + %i1976 = insertelement <4 x double> zeroinitializer, double %i1783, i64 0 + %i1982 = insertelement <4 x double> %i1976, double %i1788, i64 1 + %i1988 = insertelement <4 x double> %i1982, double %i1793, i64 2 + %i1994 = insertelement <4 x double> %i1988, double %i1798, i64 3 + ret <4 x double> %i1994 +} diff --git a/llvm/test/Transforms/SimplifyCFG/nonintegral.ll b/llvm/test/Transforms/SimplifyCFG/nonintegral.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/nonintegral.ll @@ -0,0 +1,28 @@ +; RUN: opt -passes=simplifycfg -S < %s | FileCheck %s + +target datalayout = "ni:1" + +define void @test_01(i64 addrspace(1)* align 8 %ptr) { +; CHECK-LABEL: @test_01( +; CHECK-NOT: ptrtoint +; CHECK-NEXT: icmp eq i64 addrspace(1)* %ptr, null +; CHECK-NOT: ptrtoint + %cond1 = icmp eq i64 addrspace(1)* %ptr, null + %cond2 = icmp eq i64 addrspace(1)* %ptr, null + br i1 %cond1, label %true1, label %false1 + +true1: + br i1 %cond2, label %true2, label %false2 + +false1: + store i64 1, i64 addrspace(1)* %ptr, align 8 + br label %true1 + +true2: + store i64 2, i64 addrspace(1)* %ptr, align 8 + ret void + +false2: + store i64 3, i64 addrspace(1)* %ptr, align 8 + ret void +} diff --git a/llvm/test/tools/llvm-objdump/MachO/chained-fixups.yaml b/llvm/test/tools/llvm-objdump/MachO/chained-fixups.yaml --- a/llvm/test/tools/llvm-objdump/MachO/chained-fixups.yaml +++ b/llvm/test/tools/llvm-objdump/MachO/chained-fixups.yaml @@ -1,102 +1,107 @@ # RUN: yaml2obj %s -o %t # RUN: llvm-objdump -p %t | FileCheck %s # RUN: llvm-otool -l %t | FileCheck %s -# + # CHECK: LC_DYLD_CHAINED_FIXUPS # CHECK: LC_DYLD_EXPORTS_TRIE +# RUN: llvm-objdump --macho --chained-fixups %t | \ +# RUN: FileCheck --check-prefix=DETAILS -DNAME=%t %s +# RUN: llvm-otool -chained_fixups %t | \ +# RUN: FileCheck --check-prefix=DETAILS -DNAME=%t %s + +# DETAILS: [[NAME]]: +# DETAILS-NEXT: chained fixups header (LC_DYLD_CHAINED_FIXUPS) +# DETAILS-NEXT: fixups_version = 0 +# DETAILS-NEXT: starts_offset = 32 +# DETAILS-NEXT: imports_offset = 44 +# DETAILS-NEXT: symbols_offset = 44 +# DETAILS-NEXT: imports_count = 0 +# DETAILS-NEXT: imports_format = 1 (DYLD_CHAINED_IMPORT) +# DETAILS-NEXT: symbols_format = 0 + +## This yaml is from a dylib produced by ld64 +## echo ".global _foo\n_foo" > dylib.s +## clang -target=x86_64-apple-macos12 -dynamiclib -isysroot Inputs/MacOSX.sdk dylib.s -o libdylib.dylib +## obj2yaml --raw-segment=data libdylib.dylib --- !mach-o 
+IsLittleEndian: true FileHeader: magic: 0xFEEDFACF - cputype: 0x100000C - cpusubtype: 0x0 - filetype: 0x2 - ncmds: 16 - sizeofcmds: 744 - flags: 0x200085 + cputype: 0x1000007 + cpusubtype: 0x3 + filetype: 0x6 + ncmds: 13 + sizeofcmds: 568 + flags: 0x100085 reserved: 0x0 LoadCommands: - cmd: LC_SEGMENT_64 - cmdsize: 72 - segname: __PAGEZERO - vmaddr: 0 - vmsize: 4294967296 - fileoff: 0 - filesize: 0 - maxprot: 0 - initprot: 0 - nsects: 0 - flags: 0 - - cmd: LC_SEGMENT_64 - cmdsize: 232 + cmdsize: 152 segname: __TEXT - vmaddr: 4294967296 + vmaddr: 0 vmsize: 16384 fileoff: 0 filesize: 16384 maxprot: 5 initprot: 5 - nsects: 2 + nsects: 1 flags: 0 Sections: - sectname: __text segname: __TEXT - addr: 0x100003F98 - size: 24 - offset: 0x3F98 - align: 2 + addr: 0x4000 + size: 0 + offset: 0x4000 + align: 0 reloff: 0x0 nreloc: 0 flags: 0x80000400 reserved1: 0x0 reserved2: 0x0 reserved3: 0x0 - content: C0035FD6FF4300D100008052FF0F00B9FF430091C0035FD6 - - sectname: __unwind_info - segname: __TEXT - addr: 0x100003FB0 - size: 80 - offset: 0x3FB0 - align: 2 - reloff: 0x0 - nreloc: 0 - flags: 0x0 - reserved1: 0x0 - reserved2: 0x0 - reserved3: 0x0 - content: 010000001C000000000000001C000000000000001C00000002000000983F00003400000034000000B13F00000000000034000000030000000C0002001400020000000001040000000010000200000002 + content: '' - cmd: LC_SEGMENT_64 cmdsize: 72 segname: __LINKEDIT - vmaddr: 4294983680 + vmaddr: 16384 vmsize: 16384 fileoff: 16384 - filesize: 753 + filesize: 96 maxprot: 1 initprot: 1 nsects: 0 flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: libdylib.dylib + ZeroPadBytes: 3 - cmd: LC_DYLD_CHAINED_FIXUPS cmdsize: 16 dataoff: 16384 - datasize: 56 + datasize: 48 - cmd: LC_DYLD_EXPORTS_TRIE cmdsize: 16 - dataoff: 16440 - datasize: 56 + dataoff: 16432 + datasize: 16 - cmd: LC_SYMTAB cmdsize: 24 - symoff: 16504 - nsyms: 15 - stroff: 16744 - strsize: 120 + symoff: 16456 + nsyms: 1 + stroff: 16472 + strsize: 8 - cmd: LC_DYSYMTAB cmdsize: 80 ilocalsym: 0 - nlocalsym: 12 - iextdefsym: 12 - nextdefsym: 3 - iundefsym: 15 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 nundefsym: 0 tocoff: 0 ntoc: 0 @@ -110,136 +115,37 @@ nextrel: 0 locreloff: 0 nlocrel: 0 - - cmd: LC_LOAD_DYLINKER - cmdsize: 32 - name: 12 - Content: '/usr/lib/dyld' - ZeroPadBytes: 7 - cmd: LC_UUID cmdsize: 24 - uuid: F445529E-643C-3A38-8F59-AB64566BCAFF + uuid: 52409B91-DF59-346A-A63F-D4E6FFDC3E04 - cmd: LC_BUILD_VERSION cmdsize: 32 platform: 1 minos: 786432 - sdk: 786432 + sdk: 851968 ntools: 1 Tools: - tool: 3 - version: 46596096 + version: 53674242 - cmd: LC_SOURCE_VERSION cmdsize: 16 version: 0 - - cmd: LC_MAIN - cmdsize: 24 - entryoff: 16284 - stacksize: 0 - cmd: LC_LOAD_DYLIB cmdsize: 56 dylib: name: 24 timestamp: 2 - current_version: 85917696 + current_version: 65793 compatibility_version: 65536 - Content: '/usr/lib/libSystem.B.dylib' - ZeroPadBytes: 6 + Content: '/usr/lib/libSystem.dylib' + ZeroPadBytes: 8 - cmd: LC_FUNCTION_STARTS cmdsize: 16 - dataoff: 16496 + dataoff: 16448 datasize: 8 - cmd: LC_DATA_IN_CODE cmdsize: 16 - dataoff: 16504 + dataoff: 16456 datasize: 0 - - cmd: LC_CODE_SIGNATURE - cmdsize: 16 - dataoff: 16864 - datasize: 273 -LinkEditData: - NameList: - - n_strx: 33 - n_type: 0x64 - n_sect: 0 - n_desc: 0 - n_value: 0 - - n_strx: 39 - n_type: 0x64 - n_sect: 0 - n_desc: 0 - n_value: 0 - - n_strx: 46 - n_type: 0x66 - n_sect: 0 - n_desc: 1 - n_value: 1636754403 - - n_strx: 1 - n_type: 0x2E - n_sect: 1 - n_desc: 0 - 
n_value: 4294983576 - - n_strx: 109 - n_type: 0x24 - n_sect: 1 - n_desc: 0 - n_value: 4294983576 - - n_strx: 1 - n_type: 0x24 - n_sect: 0 - n_desc: 0 - n_value: 4 - - n_strx: 1 - n_type: 0x4E - n_sect: 1 - n_desc: 0 - n_value: 4 - - n_strx: 1 - n_type: 0x2E - n_sect: 1 - n_desc: 0 - n_value: 4294983580 - - n_strx: 114 - n_type: 0x24 - n_sect: 1 - n_desc: 0 - n_value: 4294983580 - - n_strx: 1 - n_type: 0x24 - n_sect: 0 - n_desc: 0 - n_value: 20 - - n_strx: 1 - n_type: 0x4E - n_sect: 1 - n_desc: 0 - n_value: 20 - - n_strx: 1 - n_type: 0x64 - n_sect: 1 - n_desc: 0 - n_value: 0 - - n_strx: 2 - n_type: 0xF - n_sect: 1 - n_desc: 16 - n_value: 4294967296 - - n_strx: 22 - n_type: 0xF - n_sect: 1 - n_desc: 0 - n_value: 4294983576 - - n_strx: 27 - n_type: 0xF - n_sect: 1 - n_desc: 0 - n_value: 4294983580 - StringTable: - - ' ' - - __mh_execute_header - - _foo - - _main - - '/tmp/' - - main.c - - '/var/folders/gj/wf3swl0x215b2sq1qy84kzkm0000gn/T/main-e32fe7.o' - - _foo - - _main +__LINKEDIT: 00000000200000002C0000002C000000000000000100000000000000000000000200000000000000000000000000000000015F666F6F000804008080010000000000000000000000020000000F010000004000000000000020005F666F6F0000 ... diff --git a/llvm/test/tools/llvm-reduce/simplify-cfg.ll b/llvm/test/tools/llvm-reduce/simplify-cfg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-reduce/simplify-cfg.ll @@ -0,0 +1,33 @@ +; RUN: llvm-reduce --delta-passes=simplify-cfg --test %python --test-arg %p/Inputs/remove-bbs.py -abort-on-invalid-reduction %s -o %t + +; RUN: FileCheck --check-prefix=CHECK-FINAL %s --input-file=%t +; CHECK-FINAL: @f1 +; CHECK-FINAL-NOT: x6: +; CHECK-FINAL-NOT: x10: + +define void @f1(ptr %interesting3, i32 %interesting2) { + %x3 = alloca ptr, i32 0, align 8 + store ptr %interesting3, ptr %interesting3, align 8 + switch i32 %interesting2, label %interesting1 [ + i32 0, label %x6 + i32 1, label %x11 + ] + +x4: + %x5 = call ptr @f2() + br label %x10 + +x10: + br label %interesting1 + +x6: + br label %x11 + +x11: + br label %interesting1 + +interesting1: + ret void +} + +declare ptr @f2() diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp --- a/llvm/tools/gold/gold-plugin.cpp +++ b/llvm/tools/gold/gold-plugin.cpp @@ -722,8 +722,8 @@ // Returns true if S is valid as a C language identifier. 
static bool isValidCIdentifier(StringRef S) { return !S.empty() && (isAlpha(S[0]) || S[0] == '_') && - std::all_of(S.begin() + 1, S.end(), - [](char C) { return C == '_' || isAlnum(C); }); + llvm::all_of(llvm::drop_begin(S), + [](char C) { return C == '_' || isAlnum(C); }); } static bool isUndefined(ld_plugin_symbol &Sym) { diff --git a/llvm/tools/llvm-objdump/MachODump.h b/llvm/tools/llvm-objdump/MachODump.h --- a/llvm/tools/llvm-objdump/MachODump.h +++ b/llvm/tools/llvm-objdump/MachODump.h @@ -36,6 +36,7 @@ extern bool Bind; extern bool DataInCode; extern std::string DisSymName; +extern bool ChainedFixups; extern bool DyldInfo; extern bool DylibId; extern bool DylibsUsed; diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp --- a/llvm/tools/llvm-objdump/MachODump.cpp +++ b/llvm/tools/llvm-objdump/MachODump.cpp @@ -81,6 +81,7 @@ bool objdump::FunctionStarts; bool objdump::LinkOptHints; bool objdump::InfoPlist; +bool objdump::ChainedFixups; bool objdump::DyldInfo; bool objdump::DylibsUsed; bool objdump::DylibId; @@ -112,6 +113,7 @@ FunctionStarts = InputArgs.hasArg(OBJDUMP_function_starts); LinkOptHints = InputArgs.hasArg(OBJDUMP_link_opt_hints); InfoPlist = InputArgs.hasArg(OBJDUMP_info_plist); + ChainedFixups = InputArgs.hasArg(OBJDUMP_chained_fixups); DyldInfo = InputArgs.hasArg(OBJDUMP_dyld_info); DylibsUsed = InputArgs.hasArg(OBJDUMP_dylibs_used); DylibId = InputArgs.hasArg(OBJDUMP_dylib_id); @@ -1193,6 +1195,48 @@ reportError(std::move(Err), Obj->getFileName()); } +static void +PrintChainedFixupsHeader(const MachO::dyld_chained_fixups_header &H) { + outs() << "chained fixups header (LC_DYLD_CHAINED_FIXUPS)\n"; + outs() << " fixups_version = " << H.fixups_version << '\n'; + outs() << " starts_offset = " << H.starts_offset << '\n'; + outs() << " imports_offset = " << H.imports_offset << '\n'; + outs() << " symbols_offset = " << H.symbols_offset << '\n'; + outs() << " imports_count = " << H.imports_count << '\n'; + + outs() << " imports_format = " << H.imports_format; + switch (H.imports_format) { + case llvm::MachO::DYLD_CHAINED_IMPORT: + outs() << " (DYLD_CHAINED_IMPORT)"; + break; + case llvm::MachO::DYLD_CHAINED_IMPORT_ADDEND: + outs() << " (DYLD_CHAINED_IMPORT_ADDEND)"; + break; + case llvm::MachO::DYLD_CHAINED_IMPORT_ADDEND64: + outs() << " (DYLD_CHAINED_IMPORT_ADDEND64)"; + break; + } + outs() << '\n'; + + outs() << " symbols_format = " << H.symbols_format; + if (H.symbols_format == llvm::MachO::DYLD_CHAINED_SYMBOL_ZLIB) + outs() << " (zlib compressed)"; + outs() << '\n'; +} + +static void PrintChainedFixups(MachOObjectFile *O) { + // MachOObjectFile::getChainedFixupsHeader() reads LC_DYLD_CHAINED_FIXUPS. + // FIXME: Support chained fixups in __TEXT,__chain_starts section too. + auto ChainedFixupHeader = + unwrapOrError(O->getChainedFixupsHeader(), O->getFileName()); + if (!ChainedFixupHeader) + return; + + PrintChainedFixupsHeader(*ChainedFixupHeader); + + // FIXME: Print more things. +} + static void PrintDyldInfo(MachOObjectFile *O) { outs() << "dyld information:" << '\n'; printMachOChainedFixups(O); @@ -1916,8 +1960,9 @@ // UniversalHeaders or ArchiveHeaders. 
if (Disassemble || Relocations || PrivateHeaders || ExportsTrie || Rebase || Bind || SymbolTable || LazyBind || WeakBind || IndirectSymbols || - DataInCode || FunctionStarts || LinkOptHints || DyldInfo || DylibsUsed || - DylibId || Rpaths || ObjcMetaData || (!FilterSections.empty())) { + DataInCode || FunctionStarts || LinkOptHints || ChainedFixups || + DyldInfo || DylibsUsed || DylibId || Rpaths || ObjcMetaData || + (!FilterSections.empty())) { if (LeadingHeaders) { outs() << Name; if (!ArchiveMemberName.empty()) @@ -1986,6 +2031,8 @@ DumpSectionContents(FileName, MachOOF, Verbose); if (InfoPlist) DumpInfoPlistSectionContents(FileName, MachOOF); + if (ChainedFixups) + PrintChainedFixups(MachOOF); if (DyldInfo) PrintDyldInfo(MachOOF); if (DylibsUsed) diff --git a/llvm/tools/llvm-objdump/ObjdumpOpts.td b/llvm/tools/llvm-objdump/ObjdumpOpts.td --- a/llvm/tools/llvm-objdump/ObjdumpOpts.td +++ b/llvm/tools/llvm-objdump/ObjdumpOpts.td @@ -299,11 +299,15 @@ "Mach-O objects (requires --macho)">, Group; +def chained_fixups : Flag<["--"], "chained-fixups">, + HelpText<"Print chained fixup information (requires --macho)">, + Group; + def dyld_info : Flag<["--"], "dyld_info">, - HelpText<"Print bind and rebase information used by dyld to resolve " - "external references in a final linked binary " - "(requires --macho)">, - Group; + HelpText<"Print bind and rebase information used by dyld to resolve " + "external references in a final linked binary " + "(requires --macho)">, + Group; def dylibs_used : Flag<["--"], "dylibs-used">, HelpText<"Print the shared libraries used for linked " diff --git a/llvm/tools/llvm-objdump/OtoolOpts.td b/llvm/tools/llvm-objdump/OtoolOpts.td --- a/llvm/tools/llvm-objdump/OtoolOpts.td +++ b/llvm/tools/llvm-objdump/OtoolOpts.td @@ -37,13 +37,15 @@ def x : Flag<["-"], "x">, HelpText<"print all text sections">; def X : Flag<["-"], "X">, HelpText<"omit leading addresses or headers">; +def chained_fixups : Flag<["-"], "chained_fixups">, + HelpText<"print chained fixup information">; + // Not (yet?) 
implemented: // def a : Flag<["-"], "a">, HelpText<"print archive header">; // -c print argument strings of a core file // -m don't use archive(member) syntax // -dyld_info // -dyld_opcodes -// -chained_fixups // -addr_slide=arg // -function_offsets diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -2787,6 +2787,8 @@ FilterSections.push_back(",__text"); LeadingAddr = LeadingHeaders = !InputArgs.hasArg(OTOOL_X); + ChainedFixups = InputArgs.hasArg(OTOOL_chained_fixups); + InputFilenames = InputArgs.getAllArgValues(OTOOL_INPUT); if (InputFilenames.empty()) reportCmdLineError("no input file"); @@ -2990,11 +2992,12 @@ !DynamicRelocations && !FileHeaders && !PrivateHeaders && !RawClangAST && !Relocations && !SectionHeaders && !SectionContents && !SymbolTable && !DynamicSymbolTable && !UnwindInfo && !FaultMapSection && !Offloading && - !(MachOOpt && (Bind || DataInCode || DyldInfo || DylibId || DylibsUsed || - ExportsTrie || FirstPrivateHeader || FunctionStarts || - IndirectSymbols || InfoPlist || LazyBind || LinkOptHints || - ObjcMetaData || Rebase || Rpaths || UniversalHeaders || - WeakBind || !FilterSections.empty()))) { + !(MachOOpt && + (Bind || DataInCode || ChainedFixups || DyldInfo || DylibId || + DylibsUsed || ExportsTrie || FirstPrivateHeader || FunctionStarts || + IndirectSymbols || InfoPlist || LazyBind || LinkOptHints || + ObjcMetaData || Rebase || Rpaths || UniversalHeaders || WeakBind || + !FilterSections.empty()))) { T->printHelp(ToolName); return 2; } diff --git a/llvm/tools/llvm-reduce/CMakeLists.txt b/llvm/tools/llvm-reduce/CMakeLists.txt --- a/llvm/tools/llvm-reduce/CMakeLists.txt +++ b/llvm/tools/llvm-reduce/CMakeLists.txt @@ -49,6 +49,7 @@ deltas/ReduceRegisterMasks.cpp deltas/ReduceRegisterDefs.cpp deltas/ReduceRegisterUses.cpp + deltas/ReduceUsingSimplifyCFG.cpp deltas/RunIRPasses.cpp deltas/SimplifyInstructions.cpp llvm-reduce.cpp diff --git a/llvm/tools/llvm-reduce/DeltaManager.cpp b/llvm/tools/llvm-reduce/DeltaManager.cpp --- a/llvm/tools/llvm-reduce/DeltaManager.cpp +++ b/llvm/tools/llvm-reduce/DeltaManager.cpp @@ -39,6 +39,7 @@ #include "deltas/ReduceRegisterMasks.h" #include "deltas/ReduceRegisterUses.h" #include "deltas/ReduceSpecialGlobals.h" +#include "deltas/ReduceUsingSimplifyCFG.h" #include "deltas/ReduceVirtualRegisters.h" #include "deltas/RunIRPasses.h" #include "deltas/SimplifyInstructions.h" @@ -75,6 +76,7 @@ DELTA_PASS("operands-to-args", reduceOperandsToArgsDeltaPass) \ DELTA_PASS("operands-skip", reduceOperandsSkipDeltaPass) \ DELTA_PASS("operand-bundles", reduceOperandBundesDeltaPass) \ + DELTA_PASS("simplify-cfg", reduceUsingSimplifyCFGDeltaPass) \ DELTA_PASS("attributes", reduceAttributesDeltaPass) \ DELTA_PASS("module-data", reduceModuleDataDeltaPass) \ } while (false) diff --git a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.h b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.h new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.h @@ -0,0 +1,23 @@ +//===- ReduceUsingSimplifyCFG.h - Specialized Delta Pass ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to call SimplifyCFG on individual basic blocks. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_SIMPLIFYCFG_H +#define LLVM_TOOLS_LLVM_REDUCE_DELTAS_SIMPLIFYCFG_H + +#include "Delta.h" + +namespace llvm { +void reduceUsingSimplifyCFGDeltaPass(TestRunner &Test); +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp @@ -0,0 +1,34 @@ +//===- ReduceUsingSimplifyCFG.cpp - Specialized Delta Pass ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to call SimplifyCFG on individual basic blocks. +// +//===----------------------------------------------------------------------===// + +#include "ReduceUsingSimplifyCFG.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +static void reduceUsingSimplifyCFG(Oracle &O, Module &Program) { + SmallVector<BasicBlock *> ToSimplify; + for (auto &F : Program) + for (auto &BB : F) + if (!O.shouldKeep()) + ToSimplify.push_back(&BB); + TargetTransformInfo TTI(Program.getDataLayout()); + for (auto *BB : ToSimplify) + simplifyCFG(BB, TTI); +} + +void llvm::reduceUsingSimplifyCFGDeltaPass(TestRunner &Test) { + outs() << "*** Reducing using SimplifyCFG...\n"; + runDeltaPass(Test, reduceUsingSimplifyCFG); +} diff --git a/llvm/tools/llvm-reduce/llvm-reduce.cpp b/llvm/tools/llvm-reduce/llvm-reduce.cpp --- a/llvm/tools/llvm-reduce/llvm-reduce.cpp +++ b/llvm/tools/llvm-reduce/llvm-reduce.cpp @@ -17,27 +17,19 @@ #include "DeltaManager.h" #include "ReducerWorkItem.h" #include "TestRunner.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" -#include "llvm/ADT/SmallString.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/CodeGen/CommandFlags.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" -#include "llvm/MC/TargetRegistry.h" +#include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Host.h" #include "llvm/Support/InitLLVM.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetSelect.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h" #include <system_error> #include <vector> @@ -117,9 +109,19 @@ void writeBitcode(ReducerWorkItem &M, llvm::raw_ostream &OutStream) { if (M.LTOInfo && M.LTOInfo->IsThinLTO && M.LTOInfo->EnableSplitLTOUnit) { - legacy::PassManager PM; - 
PM.add(llvm::createWriteThinLTOBitcodePass(OutStream)); - PM.run(*(M.M)); + PassBuilder PB; + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + ModulePassManager MPM; + MPM.addPass(ThinLTOBitcodeWriterPass(OutStream, nullptr)); + MPM.run(*M.M, MAM); } else { std::unique_ptr<ModuleSummaryIndex> Index; if (M.LTOInfo && M.LTOInfo->HasSummary) { diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -970,8 +970,8 @@ report_fatal_error("Text output is incompatible with -module-hash"); Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder)); } else if (OutputThinLTOBC) - Passes.add(createWriteThinLTOBitcodePass( - *OS, ThinLinkOut ? &ThinLinkOut->os() : nullptr)); + report_fatal_error( + "Use the new pass manager for printing ThinLTO bitcode"); else Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash)); diff --git a/llvm/unittests/ADT/SmallSetTest.cpp b/llvm/unittests/ADT/SmallSetTest.cpp --- a/llvm/unittests/ADT/SmallSetTest.cpp +++ b/llvm/unittests/ADT/SmallSetTest.cpp @@ -21,11 +21,17 @@ SmallSet<int, 4> s1; - for (int i = 0; i < 4; i++) - s1.insert(i); + for (int i = 0; i < 4; i++) { + auto InsertResult = s1.insert(i); + EXPECT_EQ(*InsertResult.first, i); + EXPECT_EQ(InsertResult.second, true); + } - for (int i = 0; i < 4; i++) - s1.insert(i); + for (int i = 0; i < 4; i++) { + auto InsertResult = s1.insert(i); + EXPECT_EQ(*InsertResult.first, i); + EXPECT_EQ(InsertResult.second, false); + } EXPECT_EQ(4u, s1.size()); @@ -38,8 +44,17 @@ TEST(SmallSetTest, Grow) { SmallSet<int, 4> s1; - for (int i = 0; i < 8; i++) - s1.insert(i); + for (int i = 0; i < 8; i++) { + auto InsertResult = s1.insert(i); + EXPECT_EQ(*InsertResult.first, i); + EXPECT_EQ(InsertResult.second, true); + } + + for (int i = 0; i < 8; i++) { + auto InsertResult = s1.insert(i); + EXPECT_EQ(*InsertResult.first, i); + EXPECT_EQ(InsertResult.second, false); + } EXPECT_EQ(8u, s1.size()); diff --git a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp --- a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp +++ b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp @@ -361,7 +361,7 @@ char32_t Codepoint = Entry.first; const std::string &Name = Entry.second; // Ignore names which are not valid. 
- if (Name.empty() || !std::all_of(Name.begin(), Name.end(), [](char C) { + if (Name.empty() || !llvm::all_of(Name, [](char C) { return llvm::is_contained(Letters, C); })) { continue; diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn --- a/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn @@ -7,7 +7,10 @@ deps += [ "//compiler-rt/lib/msan" ] } if (current_os == "linux" || current_os == "android") { - deps += [ "//compiler-rt/lib/ubsan_minimal" ] + deps += [ + "//compiler-rt/lib/ubsan", + "//compiler-rt/lib/ubsan_minimal", + ] } if (current_os != "win" && current_os != "baremetal") { deps += [ "//compiler-rt/lib/asan" ] diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn --- a/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn @@ -1,3 +1,12 @@ +import("//compiler-rt/target.gni") + +group("ubsan") { + deps = [ + ":ubsan_standalone", + ":ubsan_standalone_cxx", + ] +} + source_set("sources") { configs -= [ "//llvm/utils/gn/build:llvm_code" ] configs += [ "//llvm/utils/gn/build:crt_code" ] @@ -46,7 +55,6 @@ sources = [ "ubsan_win_dynamic_runtime_thunk.cpp" ] } -# Unreferenced; at the moment exists to make sync_source_lists_from_cmake happy. source_set("standalone_sources") { configs -= [ "//llvm/utils/gn/build:llvm_code" ] configs -= [ "//llvm/utils/gn/build:no_rtti" ] @@ -57,6 +65,11 @@ "ubsan_init_standalone.cpp", "ubsan_signals_standalone.cpp", ] + deps = [ + ":sources", + "//compiler-rt/lib/interception:sources", + "//compiler-rt/lib/sanitizer_common:sources", + ] } source_set("cxx_sources") { @@ -72,3 +85,34 @@ "ubsan_type_hash_win.cpp", ] } + +# FIXME: Make ubsan_standalone work on mac. 
+if (current_os != "mac") { + static_library("ubsan_standalone") { + output_dir = crt_current_out_dir + output_name = "clang_rt.ubsan_standalone$crt_current_target_suffix" + complete_static_lib = true + configs -= [ + "//llvm/utils/gn/build:llvm_code", + "//llvm/utils/gn/build:thin_archive", + ] + deps = [ + ":sources", + ":standalone_sources", + ] + configs += [ "//llvm/utils/gn/build:crt_code" ] + sources = [ "ubsan_init_standalone_preinit.cpp" ] + } + + static_library("ubsan_standalone_cxx") { + output_dir = crt_current_out_dir + output_name = "clang_rt.ubsan_standalone_cxx$crt_current_target_suffix" + complete_static_lib = true + configs -= [ + "//llvm/utils/gn/build:llvm_code", + "//llvm/utils/gn/build:thin_archive", + ] + deps = [ ":cxx_sources" ] + configs += [ "//llvm/utils/gn/build:crt_code" ] + } +} diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn @@ -25,6 +25,7 @@ "ELF.cpp", "ELFLinkGraphBuilder.cpp", "ELF_aarch64.cpp", + "ELF_i386.cpp", "ELF_riscv.cpp", "ELF_x86_64.cpp", "JITLink.cpp", @@ -36,6 +37,7 @@ "MachO_x86_64.cpp", "MemoryFlags.cpp", "aarch64.cpp", + "i386.cpp", "riscv.cpp", "x86_64.cpp", ] diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn @@ -40,6 +40,7 @@ "deltas/ReduceRegisterMasks.cpp", "deltas/ReduceRegisterUses.cpp", "deltas/ReduceSpecialGlobals.cpp", + "deltas/ReduceUsingSimplifyCFG.cpp", "deltas/ReduceVirtualRegisters.cpp", "deltas/RunIRPasses.cpp", "deltas/SimplifyInstructions.cpp", diff --git a/llvm/utils/release/test-release.sh b/llvm/utils/release/test-release.sh --- a/llvm/utils/release/test-release.sh +++ b/llvm/utils/release/test-release.sh @@ -35,6 +35,7 @@ do_libunwind="yes" do_test_suite="yes" do_openmp="yes" +do_bolt="no" do_lld="yes" do_lldb="yes" do_polly="yes" @@ -163,6 +164,12 @@ -no-openmp ) do_openmp="no" ;; + -bolt ) + do_bolt="yes" + ;; + -no-bolt ) + do_bolt="no" + ;; -no-lld ) do_lld="no" ;; @@ -265,6 +272,9 @@ if [ $do_openmp = "yes" ]; then projects="$projects openmp" fi +if [ $do_bolt = "yes" ]; then + projects="$projects bolt" +fi if [ $do_lld = "yes" ]; then projects="$projects lld" fi diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake --- a/mlir/cmake/modules/AddMLIR.cmake +++ b/mlir/cmake/modules/AddMLIR.cmake @@ -159,7 +159,7 @@ " filepath: \"${LLVM_TARGET_DEFINITIONS_ABSOLUTE}\"\n" " includes: \"${CMAKE_CURRENT_SOURCE_DIR};${tblgen_includes}\"\n" ) - + add_public_tablegen_target(${target}) endfunction() @@ -490,6 +490,17 @@ ${ARG_PUBLIC_LIBS} ) target_sources(${name} PRIVATE ${_objects}) + + # Linux defaults to allowing undefined symbols in shared libraries whereas + # many other platforms are more strict. We want these libraries to be + # self contained, and we want any undefined symbols to be reported at + # library construction time, not at library use, so make Linux strict too. + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + target_link_options(${name} PRIVATE + "LINKER:-z,defs" + ) + endif() + # TODO: Should be transitive. 
set_target_properties(${name} PROPERTIES MLIR_AGGREGATE_EXCLUDE_LIBS "${_embed_libs}") diff --git a/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h --- a/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h @@ -41,9 +41,6 @@ /// The state is initialized by default. bool isUninitialized() const override { return false; } - /// The state is always initialized. - ChangeResult defaultInitialize() override { return ChangeResult::NoChange; } - /// Set the state of the program point to live. ChangeResult setToLive(); @@ -101,9 +98,6 @@ /// The state is initialized by default. bool isUninitialized() const override { return false; } - /// The state is always initialized. - ChangeResult defaultInitialize() override { return ChangeResult::NoChange; } - /// Print the known predecessors. void print(raw_ostream &os) const override; diff --git a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h --- a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h @@ -42,10 +42,6 @@ /// Reset the dense lattice to a pessimistic value. This occurs when the /// analysis cannot reason about the data-flow. virtual ChangeResult reset() = 0; - - /// Returns true if the lattice state has reached a pessimistic fixpoint. That - /// is, no further modifications to the lattice can occur. - virtual bool isAtFixpoint() const = 0; }; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h --- a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h @@ -38,10 +38,6 @@ /// if the value of the lattice changed. virtual ChangeResult join(const AbstractSparseLattice &rhs) = 0; - /// Returns true if the lattice element is at fixpoint and further calls to - /// `join` will not update the value of the element. - virtual bool isAtFixpoint() const = 0; - /// Mark the lattice element as having reached a pessimistic fixpoint. This /// means that the lattice may potentially have conflicting value states, and /// only the most conservative value should be relied on. @@ -97,16 +93,6 @@ /// Returns true if the value of this lattice hasn't yet been initialized. bool isUninitialized() const override { return !optimisticValue.has_value(); } - /// Force the initialization of the element by setting it to its pessimistic - /// fixpoint. - ChangeResult defaultInitialize() override { - return markPessimisticFixpoint(); - } - - /// Returns true if the lattice has reached a fixpoint. A fixpoint is when - /// the information optimistically assumed to be true is the same as the - /// information known to be true. - bool isAtFixpoint() const override { return optimisticValue == knownValue; } /// Join the information contained in the 'rhs' lattice into this /// lattice. Returns if the state of the current lattice changed. @@ -114,8 +100,8 @@ const Lattice &rhsLattice = static_cast &>(rhs); - // If we are at a fixpoint, or rhs is uninitialized, there is nothing to do. - if (isAtFixpoint() || rhsLattice.isUninitialized()) + // If rhs is uninitialized, there is nothing to do. + if (rhsLattice.isUninitialized()) return ChangeResult::NoChange; // Join the rhs value into this lattice. 
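For intuition, here is a minimal standalone C++ sketch (hypothetical names, not the MLIR API) of the optimistic-join behavior the hunk above preserves: with the isAtFixpoint() short-circuit removed, join proceeds whenever rhs is initialized and simply reports whether the optimistic value actually changed.

#include <optional>

enum class ToyChangeResult { NoChange, Change };

// Join `rhs` into `lhs` under a user-supplied join operator. Assumes ValueT is
// equality-comparable; an empty optional models "uninitialized".
template <typename ValueT, typename JoinFn>
ToyChangeResult joinOptimistic(std::optional<ValueT> &lhs,
                               const std::optional<ValueT> &rhs, JoinFn join) {
  if (!rhs) // rhs uninitialized: nothing to do.
    return ToyChangeResult::NoChange;
  if (!lhs) { // lhs uninitialized: adopt rhs wholesale.
    lhs = rhs;
    return ToyChangeResult::Change;
  }
  ValueT joined = join(*lhs, *rhs);
  if (joined == *lhs) // Already converged for this pair of states.
    return ToyChangeResult::NoChange;
  lhs = joined;
  return ToyChangeResult::Change;
}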
@@ -150,7 +136,7 @@ /// means that the lattice may potentially have conflicting value states, /// and only the conservatively known value state should be relied on. ChangeResult markPessimisticFixpoint() override { - if (isAtFixpoint()) + if (optimisticValue == knownValue) return ChangeResult::NoChange; // For this fixed point, we take whatever we knew to be true and set that diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h --- a/mlir/include/mlir/Analysis/DataFlowFramework.h +++ b/mlir/include/mlir/Analysis/DataFlowFramework.h @@ -291,10 +291,6 @@ /// Returns true if the analysis state is uninitialized. virtual bool isUninitialized() const = 0; - /// Force an uninitialized analysis state to initialize itself with a default - /// value. - virtual ChangeResult defaultInitialize() = 0; - /// Print the contents of the analysis state. virtual void print(raw_ostream &os) const = 0; diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -369,7 +369,10 @@ /// to /// /// %iv = %lb + %procId * %step - CyclicNumProcsEqNumIters = 2 + CyclicNumProcsEqNumIters = 2, + + /// No Distribution. + None = 3 }; /// Callback function type used to get processor ID, and number of processors @@ -377,11 +380,10 @@ struct ProcInfo { Value procId; Value nprocs; + DistributionMethod distributionMethod; }; -using ProcInfoCallBackFn = std::function( +using ProcInfoCallBackFn = std::function( OpBuilder &b, Location loc, ArrayRef parallelLoopRanges)>; -using OneDimProcInfoCallBackFn = - std::function; /// Options that allow distribution of loops generated in Linalg transforms to /// processors while generating the loops. @@ -389,21 +391,10 @@ /// Callback function that returns the Values for processor ID (`procId`), and /// number of processors (`nprocs`) used to execute the parallel loops. The /// number of `{procId, nprocs}` pairs returned must be equal to the number of - /// `parallelLoopRanges` passed into the callback, which in-turn is same as - /// the number of parallel loops for which the `distributionMethod` is - /// specified below. + /// `parallelLoopRanges` passed into the callback. The `parallelLoopRanges` + /// are ranges of the outer parallel loops of the operation that + /// do have non-zero tile sizes specified. ProcInfoCallBackFn procInfo; - /// Specification of how to distribute the `scf.parallel` loops that are - /// generated. As the `scf.parallel` loop is generated, the elements of this - /// vector is used (from left to right) and the specified distribution is - /// applied. If the vector is less than the number of `scf.parallel` loops - /// generated, then no distribution is applied. - SmallVector distributionMethod = {}; - - /// The map keyed by the distribution type that contains callback functions - /// that return the Values for processor ID (`procId`), and number of - /// processors (`nprocs`) used to execute the parallel loops. - DenseMap procInfoMap; }; /// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`. 
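The cyclic mappings referenced above are easy to model in scalar arithmetic. A hedged sketch follows (LoopBounds and distributeCyclic are illustrative helpers, not Linalg APIs; it assumes loops normalized to lb/ub/step form):

#include <cstdint>

struct LoopBounds {
  int64_t lb, ub, step;
};

// Cyclic distribution: processor `procId` of `nprocs` starts at
// lb + procId * step and strides by nprocs * step. When nprocs equals the
// iteration count (CyclicNumProcsEqNumIters), each processor executes exactly
// one iteration, which degenerates to the documented form
//   iv = lb + procId * step.
LoopBounds distributeCyclic(LoopBounds l, int64_t procId, int64_t nprocs) {
  return {l.lb + procId * l.step, l.ub, l.step * nprocs};
}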
@@ -521,8 +512,7 @@ function_ref bodyBuilderFn, - Optional = None, - ArrayRef distributionTypes = {}); + ArrayRef procInfo = {}); }; } // namespace linalg diff --git a/mlir/include/mlir/Dialect/Math/IR/MathOps.td b/mlir/include/mlir/Dialect/Math/IR/MathOps.td --- a/mlir/include/mlir/Dialect/Math/IR/MathOps.td +++ b/mlir/include/mlir/Dialect/Math/IR/MathOps.td @@ -538,6 +538,7 @@ %a = math.ipowi %b, %c : i32 ``` }]; + let hasFolder = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td @@ -27,12 +27,12 @@ // In addition to normal types arithmetic instructions can support cooperative // matrix. let arguments = (ins - SPV_ScalarOrVectorOrCoopMatrixOf:$operand1, - SPV_ScalarOrVectorOrCoopMatrixOf:$operand2 + SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf:$operand1, + SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf:$operand2 ); let results = (outs - SPV_ScalarOrVectorOrCoopMatrixOf:$result + SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf:$result ); let assemblyFormat = "operands attr-dict `:` type($result)"; } diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td @@ -64,6 +64,27 @@ TypedArrayAttrBase; +// Description of the supported joint matrix operations. See +// https://github.com/intel/llvm/blob/sycl/sycl/doc/design/spirv-extensions/SPV_INTEL_joint_matrix.asciidoc +def SPV_JointMatrixPropertiesINTELAttr : + SPV_Attr<"JointMatrixPropertiesINTEL", "joint_matrix_props"> { + let parameters = (ins + "int":$m_size, + "int":$n_size, + "int":$k_size, + "mlir::Type":$a_type, + "mlir::Type":$b_type, + "mlir::Type":$c_type, + "mlir::Type":$result_type, + "mlir::spirv::ScopeAttr":$scope + ); + let assemblyFormat = "`<` struct(params) `>`"; +} + +def SPV_JointMatrixPropertiesINTELArrayAttr : + TypedArrayAttrBase; + // This attribute specifies the limits for various resources on the target // architecture. 
// diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -387,6 +387,7 @@ def SPV_INTEL_fp_fast_math_mode : I32EnumAttrCase<"SPV_INTEL_fp_fast_math_mode", 4027>; def SPV_INTEL_memory_access_aliasing : I32EnumAttrCase<"SPV_INTEL_memory_access_aliasing", 4028>; def SPV_INTEL_split_barrier : I32EnumAttrCase<"SPV_INTEL_split_barrier", 4029>; +def SPV_INTEL_joint_matrix : I32EnumAttrCase<"SPV_INTEL_joint_matrix", 4030>; def SPV_NV_compute_shader_derivatives : I32EnumAttrCase<"SPV_NV_compute_shader_derivatives", 5000>; def SPV_NV_cooperative_matrix : I32EnumAttrCase<"SPV_NV_cooperative_matrix", 5001>; @@ -443,7 +444,7 @@ SPV_INTEL_usm_storage_classes, SPV_INTEL_io_pipes, SPV_INTEL_blocking_pipes, SPV_INTEL_fpga_reg, SPV_INTEL_long_constant_composite, SPV_INTEL_optnone, SPV_INTEL_debug_module, SPV_INTEL_fp_fast_math_mode, - SPV_INTEL_memory_access_aliasing, SPV_INTEL_split_barrier, + SPV_INTEL_memory_access_aliasing, SPV_INTEL_split_barrier, SPV_INTEL_joint_matrix, SPV_NV_compute_shader_derivatives, SPV_NV_cooperative_matrix, SPV_NV_fragment_shader_barycentric, SPV_NV_geometry_shader_passthrough, SPV_NV_mesh_shader, SPV_NV_ray_tracing, SPV_NV_sample_mask_override_coverage, @@ -1390,6 +1391,12 @@ ]; } +def SPV_C_JointMatrixINTEL : I32EnumAttrCase<"JointMatrixINTEL", 6118> { + list availability = [ + Extension<[SPV_INTEL_joint_matrix]> + ]; +} + def SPV_CapabilityAttr : SPV_I32EnumAttr<"Capability", "valid SPIR-V Capability", "capability", [ SPV_C_Matrix, SPV_C_Addresses, SPV_C_Linkage, SPV_C_Kernel, SPV_C_Float16, @@ -1481,7 +1488,7 @@ SPV_C_UniformTexelBufferArrayNonUniformIndexing, SPV_C_StorageTexelBufferArrayNonUniformIndexing, SPV_C_ShaderViewportIndexLayerEXT, SPV_C_ShaderViewportMaskNV, - SPV_C_ShaderStereoViewNV + SPV_C_ShaderStereoViewNV, SPV_C_JointMatrixINTEL ]>; def SPV_AM_Logical : I32EnumAttrCase<"Logical", 0>; @@ -3981,6 +3988,16 @@ "image_sampler_use_info", [SPV_ISUI_SamplerUnknown, SPV_ISUI_NeedSampler, SPV_ISUI_NoSampler]>; +def SPV_ML_ColumnMajor : I32EnumAttrCase<"ColumnMajor", 0>; +def SPV_ML_RowMajor : I32EnumAttrCase<"RowMajor", 1>; +def SPV_ML_PackedA : I32EnumAttrCase<"PackedA", 2>; +def SPV_ML_PackedB : I32EnumAttrCase<"PackedB", 3>; + +def SPV_MatrixLayoutAttr : + SPV_I32EnumAttr<"MatrixLayout", "valid SPIR-V MatrixLayout", "matrixLayout", [ + SPV_ML_ColumnMajor, SPV_ML_RowMajor, SPV_ML_PackedA, SPV_ML_PackedB + ]>; + //===----------------------------------------------------------------------===// // SPIR-V attribute definitions //===----------------------------------------------------------------------===// @@ -4013,6 +4030,8 @@ def SPV_IsCooperativeMatrixType : CPred<"$_self.isa<::mlir::spirv::CooperativeMatrixNVType>()">; def SPV_IsImageType : CPred<"$_self.isa<::mlir::spirv::ImageType>()">; +def SPV_IsJointMatrixType : + CPred<"$_self.isa<::mlir::spirv::JointMatrixINTELType>()">; def SPV_IsMatrixType : CPred<"$_self.isa<::mlir::spirv::MatrixType>()">; def SPV_IsPtrType : CPred<"$_self.isa<::mlir::spirv::PointerType>()">; def SPV_IsRTArrayType : CPred<"$_self.isa<::mlir::spirv::RuntimeArrayType>()">; @@ -4043,6 +4062,8 @@ "any SPIR-V cooperative matrix type">; def SPV_AnyImage : DialectType; +def SPV_AnyJointMatrix : DialectType; def SPV_AnyMatrix : DialectType; def SPV_AnyRTArray : DialectType; def SPV_Composite : AnyTypeOf<[SPV_Vector, SPV_AnyArray, SPV_AnyRTArray, SPV_AnyStruct, - 
SPV_AnyCooperativeMatrix, SPV_AnyMatrix]>; + SPV_AnyCooperativeMatrix, SPV_AnyJointMatrix, SPV_AnyMatrix]>; def SPV_Type : AnyTypeOf<[ SPV_Void, SPV_Bool, SPV_Integer, SPV_Float, SPV_Vector, SPV_AnyPtr, SPV_AnyArray, SPV_AnyRTArray, SPV_AnyStruct, - SPV_AnyCooperativeMatrix, SPV_AnyMatrix, SPV_AnySampledImage + SPV_AnyCooperativeMatrix, SPV_AnyJointMatrix, SPV_AnyMatrix, + SPV_AnySampledImage ]>; def SPV_SignedInt : SignedIntOfWidths<[8, 16, 32, 64]>; @@ -4072,6 +4094,11 @@ "$_self.cast<::mlir::spirv::CooperativeMatrixNVType>().getElementType()", "Cooperative Matrix">; +class SPV_JointMatrixOfType allowedTypes> : + ContainerType, SPV_IsJointMatrixType, + "$_self.cast<::mlir::spirv::JointMatrixINTELType>().getElementType()", + "Joint Matrix">; + class SPV_ScalarOrVectorOf : AnyTypeOf<[type, VectorOfLengthAndType<[2, 3, 4, 8, 16], [type]>]>; @@ -4079,6 +4106,14 @@ AnyTypeOf<[type, VectorOfLengthAndType<[2, 3, 4, 8, 16], [type]>, SPV_CoopMatrixOfType<[type]>]>; +class SPV_ScalarOrVectorOrJointMatrixOf : + AnyTypeOf<[type, VectorOfLengthAndType<[2, 3, 4, 8, 16], [type]>, + SPV_JointMatrixOfType<[type]>]>; + +class SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf : + AnyTypeOf<[type, VectorOfLengthAndType<[2, 3, 4, 8, 16], [type]>, + SPV_CoopMatrixOfType<[type]>, SPV_JointMatrixOfType<[type]> ]>; + def SPV_ScalarOrVector : AnyTypeOf<[SPV_Scalar, SPV_Vector]>; def SPV_ScalarOrVectorOrPtr : AnyTypeOf<[SPV_ScalarOrVector, SPV_AnyPtr]>; @@ -4311,6 +4346,11 @@ def SPV_OC_OpSubgroupBlockWriteINTEL : I32EnumAttrCase<"OpSubgroupBlockWriteINTEL", 5576>; def SPV_OC_OpAssumeTrueKHR : I32EnumAttrCase<"OpAssumeTrueKHR", 5630>; def SPV_OC_OpAtomicFAddEXT : I32EnumAttrCase<"OpAtomicFAddEXT", 6035>; +def SPV_OC_OpTypeJointMatrixINTEL : I32EnumAttrCase<"OpTypeJointMatrixINTEL", 6119>; +def SPV_OC_OpJointMatrixLoadINTEL : I32EnumAttrCase<"OpJointMatrixLoadINTEL", 6120>; +def SPV_OC_OpJointMatrixStoreINTEL : I32EnumAttrCase<"OpJointMatrixStoreINTEL", 6121>; +def SPV_OC_OpJointMatrixMadINTEL : I32EnumAttrCase<"OpJointMatrixMadINTEL", 6122>; +def SPV_OC_OpTypejointMatrixWorkItemLengthINTEL : I32EnumAttrCase<"OpJointMatrixWorkItemLengthINTEL", 6410>; def SPV_OpcodeAttr : SPV_I32EnumAttr<"Opcode", "valid SPIR-V instructions", "opcode", [ @@ -4376,7 +4416,10 @@ SPV_OC_OpCooperativeMatrixLoadNV, SPV_OC_OpCooperativeMatrixStoreNV, SPV_OC_OpCooperativeMatrixMulAddNV, SPV_OC_OpCooperativeMatrixLengthNV, SPV_OC_OpSubgroupBlockReadINTEL, SPV_OC_OpSubgroupBlockWriteINTEL, - SPV_OC_OpAssumeTrueKHR, SPV_OC_OpAtomicFAddEXT + SPV_OC_OpAssumeTrueKHR, SPV_OC_OpAtomicFAddEXT, + SPV_OC_OpTypeJointMatrixINTEL, SPV_OC_OpJointMatrixLoadINTEL, + SPV_OC_OpJointMatrixStoreINTEL, SPV_OC_OpJointMatrixMadINTEL, + SPV_OC_OpTypejointMatrixWorkItemLengthINTEL ]>; // End opcode section. Generated from SPIR-V spec; DO NOT MODIFY! 
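The TableGen constraints above ultimately compile down to isa<>/element-type predicates. As a rough hand-written C++ illustration (not the generated code; the 2/3/4/8/16 vector-length check of VectorOfLengthAndType is omitted, and the function name is hypothetical), SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf<SPV_Integer> accepts roughly:

#include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h"
#include "mlir/IR/BuiltinTypes.h"

// Accept a scalar integer, an integer vector, or a cooperative/joint matrix
// whose element type is an integer. Illustrative only.
static bool isIntScalarVectorCoopOrJointMatrix(mlir::Type type) {
  using namespace mlir::spirv;
  if (auto jm = type.dyn_cast<JointMatrixINTELType>())
    return jm.getElementType().isa<mlir::IntegerType>();
  if (auto cm = type.dyn_cast<CooperativeMatrixNVType>())
    return cm.getElementType().isa<mlir::IntegerType>();
  if (auto vec = type.dyn_cast<mlir::VectorType>())
    return vec.getElementType().isa<mlir::IntegerType>();
  return type.isa<mlir::IntegerType>();
}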
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td @@ -23,11 +23,11 @@ !listconcat(traits, [NoSideEffect, SameOperandsAndResultShape])> { let arguments = (ins - SPV_ScalarOrVectorOrCoopMatrixOf:$operand + SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf:$operand ); let results = (outs - SPV_ScalarOrVectorOrCoopMatrixOf:$result + SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf:$result ); let assemblyFormat = [{ $operand attr-dict `:` type($operand) `to` type($result) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVJointMatrixOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVJointMatrixOps.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVJointMatrixOps.td @@ -0,0 +1,248 @@ +//===- SPIRVJointMatrixOps.td - joint matmul ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the op definition spec of joint matrix multiply extension ops. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPIRV_IR_JOINT_MATRIX_OPS +#define MLIR_DIALECT_SPIRV_IR_JOINT_MATRIX_OPS + +// ----- + +def SPV_JointMatrixWorkItemLengthINTELOp : SPV_Op<"JointMatrixWorkItemLengthINTEL", + [NoSideEffect]> { + let summary = "See extension SPV_INTEL_joint_matrix"; + + let description = [{ + Return number of components owned by the current work-item in + a joint matrix. + + Result Type must be an 32-bit unsigned integer type scalar. + + Type is a joint matrix type. + + ``` {.ebnf} + joint-matrix-length-op ::= ssa-id `=` `spv.JointMatrixWorkItemLengthINTEL + ` : ` joint-matrix-type + ``` + + For example: + + ``` + %0 = spv.JointMatrixWorkItemLengthINTEL : !spv.jointmatrix + ``` + }]; + + let assemblyFormat = "attr-dict `:` $type"; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[SPV_INTEL_joint_matrix]>, + Capability<[SPV_C_JointMatrixINTEL]> + ]; + + let arguments = (ins + TypeAttr:$type + ); + + let results = (outs + SPV_Int32:$result + ); + let hasVerifier = 0; +} + +// ----- + +def SPV_JointMatrixLoadINTELOp : SPV_Op<"JointMatrixLoadINTEL", []> { + let summary = "See extension SPV_INTEL_joint_matrix"; + + let description = [{ + Load a matrix through a pointer. + + Result Type is the type of the loaded matrix. It must be OpTypeJointMatrixINTEL. + + Pointer is the pointer to load through. It specifies start of memory region where + elements of the matrix are stored and arranged according to Layout. + + Stride is the number of elements in memory between beginnings of successive rows, + columns (or words) in the result. It must be a scalar integer type. + + Layout indicates how the values loaded from memory are arranged. It must be the + result of a constant instruction. + + Scope is syncronization scope for operation on the matrix. It must be the result + of a constant instruction with scalar integer type. + + If present, any Memory Operands must begin with a memory operand literal. If not + present, it is the same as specifying the memory operand None. 
+ + #### Example: + ```mlir + %0 = spv.JointMatrixLoadINTEL %ptr, %stride + {memory_access = #spv.memory_access} : + (!spv.ptr, i32) -> + !spv.jointmatrix<8x16xi32, ColumnMajor, Subgroup> + ``` + }]; + + let assemblyFormat = [{ + $scope $layout operands attr-dict `:` `(` type(operands) `)` `->` type($result) + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[SPV_INTEL_joint_matrix]>, + Capability<[SPV_C_JointMatrixINTEL]> + ]; + + let arguments = (ins + SPV_ScopeAttr:$scope, + SPV_MatrixLayoutAttr:$layout, + SPV_AnyPtr:$pointer, + SPV_Integer:$stride, + OptionalAttr:$memory_access, + OptionalAttr:$alignment + ); + + let results = (outs + SPV_AnyJointMatrix:$result + ); +} + +// ----- + +def SPV_JointMatrixMadINTELOp : SPV_Op<"JointMatrixMadINTEL", + [NoSideEffect, AllTypesMatch<["c", "result"]>]> { + let summary = "See extension SPV_INTEL_joint_matrix"; + + let description = [{ + Multiply matrix A by matrix B and add matrix C to the result + of the multiplication: A*B+C. Here A is an M x K matrix, B is + a K x N matrix, and C is an M x N matrix. + + Behavior is undefined if the sizes of the operands do not meet the + conditions above. All operands and the Result Type must be + OpTypeJointMatrixINTEL. + + A must be an OpTypeJointMatrixINTEL whose Component Type is a + signed numerical type, whose Row Count equals M, and whose Column Count + equals K. + + B must be an OpTypeJointMatrixINTEL whose Component Type is a + signed numerical type, whose Row Count equals K, and whose Column Count + equals N. + + C and the Result Type must be an OpTypeJointMatrixINTEL with Row + Count equal to M and Column Count equal to N. + + Scope is the synchronization scope for the operation on the matrix. + It must be the result of a constant instruction with scalar + integer type. + + #### Example: + ```mlir + %r = spv.JointMatrixMadINTEL %a, %b, %c : + !spv.jointmatrix<8x32xi8, RowMajor, Subgroup>, + !spv.jointmatrix<32x8xi8, ColumnMajor, Subgroup> + -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + ``` + + }]; + + let assemblyFormat = [{ + $scope operands attr-dict`:` type($a) `,` type($b) `->` type($c) + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[SPV_INTEL_joint_matrix]>, + Capability<[SPV_C_JointMatrixINTEL]> + ]; + + let arguments = (ins + SPV_ScopeAttr:$scope, + SPV_AnyJointMatrix:$a, + SPV_AnyJointMatrix:$b, + SPV_AnyJointMatrix:$c + ); + + let results = (outs + SPV_AnyJointMatrix:$result + ); +} + +// ----- + +def SPV_JointMatrixStoreINTELOp : SPV_Op<"JointMatrixStoreINTEL", []> { + let summary = "See extension SPV_INTEL_joint_matrix"; + + let description = [{ + Store a matrix through a pointer. + + Pointer is the pointer to store through. It specifies the + start of the memory region where the elements of the matrix must + be stored and arranged according to Layout. + + Object is the matrix to store. It must be + OpTypeJointMatrixINTEL. + + Stride is the number of elements in memory between beginnings + of successive rows, columns (or words) of the Object. It must + be a scalar integer type. + + Layout indicates how the values stored to memory are arranged. + It must be the result of a constant instruction. + + Scope is the synchronization scope for the operation on the matrix. + It must be the result of a constant instruction with scalar + integer type. + + If present, any Memory Operands must begin with a memory operand + literal. If not present, it is the same as specifying the memory + operand None.
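Before the JointMatrixStoreINTEL example below, note that the shape rules quoted above for JointMatrixMadINTEL reduce to three equalities on (rows, columns); the sketch below, with assumed names, mirrors what the verifyJointMatrixMad verifier added later in this patch checks:

```cpp
// Minimal model of the M x K times K x N plus M x N rule. The struct and
// function names are hypothetical; only the equalities come from the patch.
#include <cassert>
#include <cstdint>

struct Shape {
  int64_t rows, cols;
};

bool madShapesAgree(Shape a, Shape b, Shape c, Shape result) {
  return a.rows == result.rows &&                   // M
         a.cols == b.rows &&                        // K
         b.cols == result.cols &&                   // N
         c.rows == result.rows && c.cols == result.cols;
}

int main() {
  // Matches the 8x32 * 32x8 -> 8x8 example in the op documentation.
  assert(madShapesAgree({8, 32}, {32, 8}, {8, 8}, {8, 8}));
  return 0;
}
```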
+ + #### Example: + ```mlir + spv.JointMatrixStoreINTEL %ptr, %m, %stride + {memory_access = #spv.memory_access} : (!spv.ptr, + !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, i32) + ``` + + }]; + + let assemblyFormat = [{ + $scope $layout operands attr-dict `:` `(` type(operands) `)` + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[SPV_INTEL_joint_matrix]>, + Capability<[SPV_C_JointMatrixINTEL]> + ]; + + let arguments = (ins + SPV_ScopeAttr:$scope, + SPV_MatrixLayoutAttr:$layout, + SPV_AnyPtr:$pointer, + SPV_AnyJointMatrix:$object, + SPV_Integer:$stride, + OptionalAttr:$memory_access, + OptionalAttr:$alignment + ); + + let results = (outs); +} + +// ----- + +#endif // MLIR_DIALECT_SPIRV_IR_JOINT_MATRIX_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td @@ -30,6 +30,7 @@ include "mlir/Dialect/SPIRV/IR/SPIRVCompositeOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVCooperativeMatrixOps.td" +include "mlir/Dialect/SPIRV/IR/SPIRVJointMatrixOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVGLOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVGroupOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVImageOps.td" diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h @@ -29,6 +29,7 @@ struct ArrayTypeStorage; struct CooperativeMatrixTypeStorage; struct ImageTypeStorage; +struct JointMatrixTypeStorage; struct MatrixTypeStorage; struct PointerTypeStorage; struct RuntimeArrayTypeStorage; @@ -420,6 +421,33 @@ Optional storage = llvm::None); }; +// SPIR-V joint matrix type +class JointMatrixINTELType + : public Type::TypeBase { +public: + using Base::Base; + + static JointMatrixINTELType get(Type elementType, Scope scope, unsigned rows, + unsigned columns, MatrixLayout matrixLayout); + Type getElementType() const; + + /// Return the scope of the joint matrix. + Scope getScope() const; + /// return the number of rows of the matrix. + unsigned getRows() const; + /// return the number of columns of the matrix. + unsigned getColumns() const; + + /// return the layout of the matrix + MatrixLayout getMatrixLayout() const; + + void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, + Optional storage = llvm::None); + void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, + Optional storage = llvm::None); +}; + // SPIR-V matrix type class MatrixType : public Type::TypeBase { diff --git a/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h b/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h --- a/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h +++ b/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h @@ -36,6 +36,7 @@ #include #include #include +#include //===----------------------------------------------------------------------===// // Codegen-compatible structures for Vector type. 
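The CRunnerUtils.h hunks that follow turn StridedMemrefIterator into a conforming forward iterator (the descriptor is now held by pointer so iterators remain copy-assignable) and add DynamicMemRefIterator. Both advance with the same odometer scheme: bump the innermost index and, on overflow, roll it back to zero and carry into the next dimension while keeping the flat offset in sync with the strides. A standalone model of that advance, with assumed names rather than the patch's code:

```cpp
#include <array>
#include <cstdint>

// Advances `idx`/`offset` to the next element of a memref with the given
// sizes/strides; returns false once iteration is exhausted.
template <int Rank>
bool advance(std::array<int64_t, Rank> &idx, int64_t &offset,
             const int64_t *sizes, const int64_t *strides) {
  for (int dim = Rank - 1; dim >= 0; --dim) {
    if (idx[dim] + 1 < sizes[dim]) {
      ++idx[dim];             // step within this dimension
      offset += strides[dim];
      return true;
    }
    offset -= idx[dim] * strides[dim]; // roll this dimension back to zero
    idx[dim] = 0;
  }
  return false; // every dimension rolled over: past the last element
}
```

Updating `offset` incrementally this way avoids re-deriving it from all indices on every step, which is why the iterators track both `indices` and `offset`.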
@@ -209,13 +210,19 @@ template class StridedMemrefIterator { public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T *; + using reference = T &; + StridedMemrefIterator(StridedMemRefType &descriptor, int64_t offset = 0) - : offset(offset), descriptor(descriptor) {} + : offset(offset), descriptor(&descriptor) {} StridedMemrefIterator &operator++() { int dim = Rank - 1; - while (dim >= 0 && indices[dim] == (descriptor.sizes[dim] - 1)) { - offset -= indices[dim] * descriptor.strides[dim]; + while (dim >= 0 && indices[dim] == (descriptor->sizes[dim] - 1)) { + offset -= indices[dim] * descriptor->strides[dim]; indices[dim] = 0; --dim; } @@ -224,17 +231,17 @@ return *this; } ++indices[dim]; - offset += descriptor.strides[dim]; + offset += descriptor->strides[dim]; return *this; } - T &operator*() { return descriptor.data[offset]; } - T *operator->() { return &descriptor.data[offset]; } + reference operator*() { return descriptor->data[offset]; } + pointer operator->() { return &descriptor->data[offset]; } const std::array &getIndices() { return indices; } bool operator==(const StridedMemrefIterator &other) const { - return other.offset == offset && &other.descriptor == &descriptor; + return other.offset == offset && other.descriptor == descriptor; } bool operator!=(const StridedMemrefIterator &other) const { @@ -245,16 +252,24 @@ /// Offset in the buffer. This can be derived from the indices and the /// descriptor. int64_t offset = 0; + /// Array of indices in the multi-dimensional memref. std::array indices = {}; + /// Descriptor for the strided memref. - StridedMemRefType &descriptor; + StridedMemRefType *descriptor; }; /// Iterate over all elements in a 0-ranked strided memref. template class StridedMemrefIterator { public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T *; + using reference = T &; + StridedMemrefIterator(StridedMemRefType &descriptor, int64_t offset = 0) : elt(descriptor.data + offset) {} @@ -263,8 +278,8 @@ return *this; } - T &operator*() { return *elt; } - T *operator->() { return elt; } + reference operator*() { return *elt; } + pointer operator->() { return elt; } // There are no indices for a 0-ranked memref, but this API is provided for // consistency with the general case. @@ -301,10 +316,20 @@ //===----------------------------------------------------------------------===// // DynamicMemRefType type. //===----------------------------------------------------------------------===// +template +class DynamicMemRefIterator; + // A reference to one of the StridedMemRef types. 
template class DynamicMemRefType { public: + int64_t rank; + T *basePtr; + T *data; + int64_t offset; + const int64_t *sizes; + const int64_t *strides; + explicit DynamicMemRefType(const StridedMemRefType &memRef) : rank(0), basePtr(memRef.basePtr), data(memRef.data), offset(memRef.offset), sizes(nullptr), strides(nullptr) {} @@ -322,12 +347,113 @@ strides = sizes + rank; } - int64_t rank; - T *basePtr; - T *data; - int64_t offset; - const int64_t *sizes; - const int64_t *strides; + template <typename Range, typename sfinae = decltype(std::declval<Range>().begin())> + T &operator[](Range &&indices) { + assert(indices.size() == rank && + "indices should match rank in memref subscript"); + if (rank == 0) + return data[offset]; + + int64_t curOffset = offset; + for (int dim = rank - 1; dim >= 0; --dim) { + int64_t currentIndex = *(indices.begin() + dim); + assert(currentIndex < sizes[dim] && "Index overflow"); + curOffset += currentIndex * strides[dim]; + } + return data[curOffset]; + } + + DynamicMemRefIterator begin() { return {*this}; } + DynamicMemRefIterator end() { return {*this, -1}; } + + // This operator[] is extremely slow and only for sugaring purposes. + DynamicMemRefType operator[](int64_t idx) { + assert(rank > 0 && "can't make a subscript of a zero-ranked array"); + + DynamicMemRefType res(*this); + --res.rank; + res.offset += idx * res.strides[0]; + ++res.sizes; + ++res.strides; + return res; + } + + // This operator* can be used in conjunction with the previous operator[] in + // order to access the underlying value in case of zero-ranked memref. + T &operator*() { + assert(rank == 0 && "not a zero-ranked memRef"); + return data[offset]; + } + +private: + DynamicMemRefType(const DynamicMemRefType &other) + : rank(other.rank), basePtr(other.basePtr), data(other.data), + offset(other.offset), sizes(other.sizes), strides(other.strides) {} +}; + +/// Iterate over all elements in a dynamic memref. +template +class DynamicMemRefIterator { +public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T *; + using reference = T &; + + DynamicMemRefIterator(DynamicMemRefType &descriptor, int64_t offset = 0) + : offset(offset), descriptor(&descriptor) { + indices.resize(descriptor.rank, 0); + } + + DynamicMemRefIterator &operator++() { + if (descriptor->rank == 0) { + offset = -1; + return *this; + } + + int dim = descriptor->rank - 1; + + while (dim >= 0 && indices[dim] == (descriptor->sizes[dim] - 1)) { + offset -= indices[dim] * descriptor->strides[dim]; + indices[dim] = 0; + --dim; + } + + if (dim < 0) { + offset = -1; + return *this; + } + + ++indices[dim]; + offset += descriptor->strides[dim]; + return *this; + } + + reference operator*() { return descriptor->data[offset]; } + pointer operator->() { return &descriptor->data[offset]; } + + const std::vector &getIndices() { return indices; } + + bool operator==(const DynamicMemRefIterator &other) const { + return other.offset == offset && other.descriptor == descriptor; + } + + bool operator!=(const DynamicMemRefIterator &other) const { + return !(*this == other); + } + +private: + /// Offset in the buffer. This can be derived from the indices and the + /// descriptor. + int64_t offset = 0; + + /// Array of indices in the multi-dimensional memref. + std::vector indices = {}; + + /// Descriptor for the dynamic memref. 
+ DynamicMemRefType *descriptor; }; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td --- a/mlir/include/mlir/IR/OpBase.td +++ b/mlir/include/mlir/IR/OpBase.td @@ -2288,8 +2288,11 @@ class ElementType : StrFunc<"getElementTypeOrSelf($" # name # ")">; class AllMatchPred values> : - CPred<"::llvm::is_splat(::llvm::makeArrayRef({" - # !interleave(values, ", ") #"}))">; + CPred; class AllMatch values, string summary> : PredOpTrait>; diff --git a/mlir/include/mlir/Transforms/TopologicalSortUtils.h b/mlir/include/mlir/Transforms/TopologicalSortUtils.h --- a/mlir/include/mlir/Transforms/TopologicalSortUtils.h +++ b/mlir/include/mlir/Transforms/TopologicalSortUtils.h @@ -90,11 +90,23 @@ function_ref isOperandReady = nullptr); /// Given a block, sort its operations in topological order, excluding its -/// terminator if it has one. +/// terminator if it has one. This sort is stable. bool sortTopologically( Block *block, function_ref isOperandReady = nullptr); +/// Compute a topological ordering of the given ops. All ops must belong to the +/// specified block. +/// +/// This sort is not stable. +/// +/// Note: If the specified ops contain incomplete/interrupted SSA use-def +/// chains, the result may not actually be a topological sorting with respect to +/// the entire program. +bool computeTopologicalSorting( + Block *block, MutableArrayRef ops, + function_ref isOperandReady = nullptr); + } // end namespace mlir #endif // MLIR_TRANSFORMS_TOPOLOGICALSORTUTILS_H diff --git a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp --- a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp @@ -49,8 +49,6 @@ // Get the dense lattice to update. AbstractDenseLattice *after = getLattice(op); - if (after->isAtFixpoint()) - return; // If this op implements region control-flow, then control-flow dictates its // transfer function. @@ -91,8 +89,6 @@ // Get the dense lattice to update. AbstractDenseLattice *after = getLattice(block); - if (after->isAtFixpoint()) - return; // The dense lattices of entry blocks are set by region control-flow or the // callgraph. diff --git a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp --- a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp @@ -87,16 +87,10 @@ // Get the result lattices. SmallVector resultLattices; resultLattices.reserve(op->getNumResults()); - // Track whether all results have reached their fixpoint. - bool allAtFixpoint = true; for (Value result : op->getResults()) { AbstractSparseLattice *resultLattice = getLatticeElement(result); - allAtFixpoint &= resultLattice->isAtFixpoint(); resultLattices.push_back(resultLattice); } - // If all result lattices have reached a fixpoint, there is nothing to do. - if (allAtFixpoint) - return; // The results of a region branch operation are determined by control-flow. if (auto branch = dyn_cast(op)) { @@ -145,16 +139,10 @@ // Get the argument lattices. SmallVector argLattices; argLattices.reserve(block->getNumArguments()); - bool allAtFixpoint = true; for (BlockArgument argument : block->getArguments()) { AbstractSparseLattice *argLattice = getLatticeElement(argument); - allAtFixpoint &= argLattice->isAtFixpoint(); argLattices.push_back(argLattice); } - // If all argument lattices have reached their fixpoints, then there is - // nothing to do. 
- if (allAtFixpoint) - return; // The argument lattices of entry blocks are set by region control-flow or the // callgraph. diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp --- a/mlir/lib/Bindings/Python/IRAttributes.cpp +++ b/mlir/lib/Bindings/Python/IRAttributes.cpp @@ -113,15 +113,9 @@ /// A python-wrapped dense array attribute with an element type and a derived /// implementation class. template -class PyDenseArrayAttribute - : public PyConcreteAttribute> { +class PyDenseArrayAttribute : public PyConcreteAttribute { public: - static constexpr typename PyConcreteAttribute< - PyDenseArrayAttribute>::IsAFunctionTy isaFunction = - DerivedT::isaFunction; - static constexpr const char *pyClassName = DerivedT::pyClassName; - using PyConcreteAttribute< - PyDenseArrayAttribute>::PyConcreteAttribute; + using PyConcreteAttribute::PyConcreteAttribute; /// Iterator over the integer elements of a dense array. class PyDenseArrayIterator { @@ -158,33 +152,29 @@ EltTy getItem(intptr_t i) { return DerivedT::getElement(*this, i); } /// Bind the attribute class. - static void bindDerived(typename PyConcreteAttribute< - PyDenseArrayAttribute>::ClassTy &c) { + static void bindDerived(typename PyConcreteAttribute::ClassTy &c) { // Bind the constructor. c.def_static( "get", [](const std::vector &values, DefaultingPyMlirContext ctx) { MlirAttribute attr = DerivedT::getAttribute(ctx->get(), values.size(), values.data()); - return PyDenseArrayAttribute(ctx->getRef(), attr); + return DerivedT(ctx->getRef(), attr); }, py::arg("values"), py::arg("context") = py::none(), "Gets a uniqued dense array attribute"); // Bind the array methods. - c.def("__getitem__", - [](PyDenseArrayAttribute &arr, intptr_t i) { - if (i >= mlirDenseArrayGetNumElements(arr)) - throw py::index_error("DenseArray index out of range"); - return arr.getItem(i); - }); - c.def("__len__", [](const PyDenseArrayAttribute &arr) { - return mlirDenseArrayGetNumElements(arr); + c.def("__getitem__", [](DerivedT &arr, intptr_t i) { + if (i >= mlirDenseArrayGetNumElements(arr)) + throw py::index_error("DenseArray index out of range"); + return arr.getItem(i); }); - c.def("__iter__", [](const PyDenseArrayAttribute &arr) { - return PyDenseArrayIterator(arr); + c.def("__len__", [](const DerivedT &arr) { + return mlirDenseArrayGetNumElements(arr); }); - c.def("__add__", [](PyDenseArrayAttribute &arr, - py::list extras) { + c.def("__iter__", + [](const DerivedT &arr) { return PyDenseArrayIterator(arr); }); + c.def("__add__", [](DerivedT &arr, py::list extras) { std::vector values; intptr_t numOldElements = mlirDenseArrayGetNumElements(arr); values.reserve(numOldElements + py::len(extras)); @@ -194,7 +184,7 @@ values.push_back(pyTryCast(attr)); MlirAttribute attr = DerivedT::getAttribute(arr.getContext()->get(), values.size(), values.data()); - return PyDenseArrayAttribute(arr.getContext(), attr); + return DerivedT(arr.getContext(), attr); }); } }; diff --git a/mlir/lib/Conversion/ComplexToLibm/ComplexToLibm.cpp b/mlir/lib/Conversion/ComplexToLibm/ComplexToLibm.cpp --- a/mlir/lib/Conversion/ComplexToLibm/ComplexToLibm.cpp +++ b/mlir/lib/Conversion/ComplexToLibm/ComplexToLibm.cpp @@ -131,7 +131,8 @@ ConversionTarget target(getContext()); target.addLegalDialect(); target.addIllegalOp(); + complex::CosOp, complex::SinOp, complex::ConjOp, + complex::LogOp, complex::AbsOp, complex::AngleOp>(); if (failed(applyPartialConversion(module, target, std::move(patterns)))) signalPassFailure(); } diff --git 
a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp --- a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp @@ -399,13 +399,13 @@ assert(areVarsUnique(*a) && "A's values aren't unique"); assert(areVarsUnique(*b) && "B's values aren't unique"); - assert(std::all_of(a->getMaybeValues().begin() + offset, - a->getMaybeValues().end(), - [](Optional var) { return var.has_value(); })); + assert( + llvm::all_of(llvm::drop_begin(a->getMaybeValues(), offset), + [](const Optional &var) { return var.has_value(); })); - assert(std::all_of(b->getMaybeValues().begin() + offset, - b->getMaybeValues().end(), - [](Optional var) { return var.has_value(); })); + assert( + llvm::all_of(llvm::drop_begin(b->getMaybeValues(), offset), + [](const Optional &var) { return var.has_value(); })); SmallVector aDimValues; a->getValues(offset, a->getNumDimVars(), &aDimValues); diff --git a/mlir/lib/Dialect/Bufferization/Transforms/AllocTensorElimination.cpp b/mlir/lib/Dialect/Bufferization/Transforms/AllocTensorElimination.cpp --- a/mlir/lib/Dialect/Bufferization/Transforms/AllocTensorElimination.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/AllocTensorElimination.cpp @@ -140,6 +140,15 @@ return WalkResult::skip(); Value allocTensor = maybeAllocTensor.front(); + // Replace only if the types match. + // TODO: This could be extended to support IR such as: + // %0 = bufferization.alloc_tensor : tensor<128xf32> + // %1 = "some_op"(%0) : (tensor<128xf32>) -> (tensor<128xf32>) + // %2 = tensor.expand_shape %1 ... + // %3 = tensor.insert_slice %2 into ... + if (allocTensor.getType() != operand.get().getType()) + return WalkResult::skip(); + // Find a suitable insertion point. Operation *insertionPoint = findValidInsertionPoint(allocTensor.getDefiningOp(), neededValues); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -450,6 +450,31 @@ applyPermutationToVector(iteratorTypes, permutation); } + // Handle distribution. Create a vector of the same size as the number of + // loops to be tiled. + SmallVector procInfo; + if (options.distribution) { + procInfo.resize( + iteratorTypes.size(), + linalg::ProcInfo{nullptr, nullptr, linalg::DistributionMethod::None}); + // Collect the loop ranges of the tiled loops that are parallel. + SmallVector parallelLoopRanges; + for (auto iteratorType : llvm::enumerate(iteratorTypes)) { + if (!isParallelIterator(iteratorType.value())) + break; + parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); + } + auto returnedProcInfo = + options.distribution->procInfo(b, op.getLoc(), parallelLoopRanges); + unsigned procIdIdx = 0; + // Update the distribution information for the loops. + for (auto iteratorType : llvm::enumerate(iteratorTypes)) { + if (!isParallelIterator(iteratorType.value())) + break; + procInfo[iteratorType.index()] = returnedProcInfo[procIdIdx++]; + } + } + // 2. Create the tiled loops. LinalgOp res = op; SmallVector ivs, tensorResults; @@ -489,8 +514,7 @@ return scf::ValueVector(tensorResults.begin(), tensorResults.end()); }; GenerateLoopNest::doit(b, op.getLoc(), loopRanges, op, iteratorTypes, - tiledLoopBodyBuilder, options.distribution, - options.distributionTypes); + tiledLoopBodyBuilder, procInfo); // 3. Transform IndexOp results w.r.t. the tiling. 
transformIndexOps(b, res, ivs, loopIndexToRangeIndex); diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -518,25 +518,11 @@ function_ref bodyBuilderFn, - Optional distributionOptions, - ArrayRef distributionTypes) { + ArrayRef procInfo) { + assert((procInfo.empty() || (procInfo.size() == loopRanges.size())) && + "expected as many entries for proc info as number of loops, even if " + "they are null entries"); SmallVector iterArgInitValues = linalgOp.getOutputTensorOperands(); - // Create procInfo so it dominates loops, if appropriate. - SmallVector procInfo; - SmallVector distributionMethod; - if (distributionOptions) { - // Collect loop ranges for parallel dimensions. - SmallVector parallelLoopRanges; - for (const auto &iteratorType : enumerate(iteratorTypes)) - if (isParallelIterator(iteratorType.value())) - parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); - - // Get their distribution schemes. - distributionMethod = distributionOptions->distributionMethod; - if (distributionMethod.size() < parallelLoopRanges.size()) - parallelLoopRanges.resize(distributionMethod.size()); - procInfo = distributionOptions->procInfo(b, loc, parallelLoopRanges); - } SmallVector lbs, ubs, steps; unpackRanges(b, loc, loopRanges, lbs, ubs, steps); @@ -554,20 +540,17 @@ return bodyBuilderFn(b, loc, ivs, operandValuesToUse); }); - if (!distributionOptions || loopNest.loops.empty()) + if (loopNest.loops.empty() || procInfo.empty()) return; // Filter out scf.for loops that were created out of parallel dimensions. - SmallVector loops; - for (const auto &iteratorType : enumerate(iteratorTypes)) - if (isParallelIterator(iteratorType.value())) - loops.push_back(loopNest.loops[iteratorType.index()]); - - // Distribute - only supports cyclic distribution for now. - for (auto it : llvm::zip(loops, procInfo, distributionMethod)) - if (std::get<2>(it) == DistributionMethod::Cyclic) - mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId, - std::get<1>(it).nprocs); + for (auto loop : llvm::enumerate(loopNest.loops)) { + if (procInfo[loop.index()].distributionMethod == + DistributionMethod::Cyclic) { + mapLoopToProcessorIds(loop.value(), procInfo[loop.index()].procId, + procInfo[loop.index()].nprocs); + } + } } /// Specialization to build affine "for" nest. @@ -578,7 +561,7 @@ function_ref bodyBuilderFn, - Optional, ArrayRef) { + ArrayRef /*procInfo*/) { SmallVector iterArgInitValues = linalgOp.getOutputTensorOperands(); assert(iterArgInitValues.empty() && "unexpected AffineForOp init values"); SmallVector lbs, ubs, steps; @@ -625,12 +608,13 @@ static void generateParallelLoopNest( OpBuilder &b, Location loc, ValueRange lbs, ValueRange ubs, ValueRange steps, ArrayRef iteratorTypes, + ArrayRef procInfo, function_ref bodyBuilderFn, - SmallVectorImpl &ivStorage, - ArrayRef distributionMethod = {}) { + SmallVectorImpl &ivStorage) { assert(lbs.size() == ubs.size()); assert(lbs.size() == steps.size()); assert(lbs.size() == iteratorTypes.size()); + assert(procInfo.empty() || (lbs.size() == procInfo.size())); // If there are no (more) loops to be generated, generate the body and be // done with it. @@ -639,55 +623,56 @@ return; } - // Find the outermost parallel loops and drop their types from the list. 
- unsigned nLoops = iteratorTypes.size(); - unsigned nOuterPar = - nLoops - iteratorTypes.drop_while(isParallelIterator).size(); - // If there are no outer parallel loops, generate one sequential loop and - // recurse. Note that we wouldn't have dropped anything from `iteratorTypes` - // in this case. - if (nOuterPar == 0) { + // recurse. + if (!isParallelIterator(iteratorTypes.front())) { LoopNest singleLoop = buildLoopNest( b, loc, lbs.take_front(), ubs.take_front(), steps.take_front(), [&](OpBuilder &b, Location loc, ValueRange ivs) { ivStorage.append(ivs.begin(), ivs.end()); - generateParallelLoopNest(b, loc, lbs.drop_front(), ubs.drop_front(), - steps.drop_front(), - iteratorTypes.drop_front(), bodyBuilderFn, - ivStorage, distributionMethod); + generateParallelLoopNest( + b, loc, lbs.drop_front(), ubs.drop_front(), steps.drop_front(), + iteratorTypes.drop_front(), + procInfo.empty() ? procInfo : procInfo.drop_front(), + bodyBuilderFn, ivStorage); }); return; } - if (distributionMethod.empty()) { + + unsigned nLoops = iteratorTypes.size(); + unsigned numProcessed = 0; + DistributionMethod distributionMethod = DistributionMethod::None; + if (procInfo.empty()) { + numProcessed = nLoops - iteratorTypes.drop_while(isParallelIterator).size(); + } else { + distributionMethod = procInfo.front().distributionMethod; + numProcessed = + nLoops - procInfo + .drop_while([&](linalg::ProcInfo p) { + return p.distributionMethod == distributionMethod; + }) + .size(); + } + + auto remainderProcInfo = + procInfo.empty() ? procInfo : procInfo.drop_front(numProcessed); + switch (distributionMethod) { + case DistributionMethod::None: { // Generate a single parallel loop-nest operation for all outermost // parallel loops and recurse. b.create( - loc, lbs.take_front(nOuterPar), ubs.take_front(nOuterPar), - steps.take_front(nOuterPar), + loc, lbs.take_front(numProcessed), ubs.take_front(numProcessed), + steps.take_front(numProcessed), [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) { ivStorage.append(localIvs.begin(), localIvs.end()); generateParallelLoopNest( - nestedBuilder, nestedLoc, lbs.drop_front(nOuterPar), - ubs.drop_front(nOuterPar), steps.drop_front(nOuterPar), - iteratorTypes.drop_front(nOuterPar), bodyBuilderFn, ivStorage, - (distributionMethod.size() < nOuterPar) - ? ArrayRef() - : distributionMethod.drop_front(nOuterPar)); + nestedBuilder, nestedLoc, lbs.drop_front(numProcessed), + ubs.drop_front(numProcessed), steps.drop_front(numProcessed), + iteratorTypes.drop_front(numProcessed), remainderProcInfo, + bodyBuilderFn, ivStorage); }); return; } - - // Process all consecutive similarly distributed loops simultaneously. - DistributionMethod methodToUse = distributionMethod[0]; - unsigned numProcessed = 1; - for (unsigned i = 1; i < nOuterPar && i < distributionMethod.size(); ++i) { - if (distributionMethod[i] != methodToUse) - break; - numProcessed++; - } - - switch (methodToUse) { case DistributionMethod::Cyclic: { // Generate a single parallel loop-nest operation for all outermost // parallel loops and recurse. @@ -699,10 +684,8 @@ generateParallelLoopNest( nestedBuilder, nestedLoc, lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), steps.drop_front(numProcessed), - iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage, - (distributionMethod.size() < numProcessed) - ? 
ArrayRef() - : distributionMethod.drop_front(numProcessed)); + iteratorTypes.drop_front(numProcessed), remainderProcInfo, + bodyBuilderFn, ivStorage); }); return; } @@ -714,11 +697,11 @@ cond = ab._and(cond, ab.slt(lbs[i], ubs[i])); ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed)); b.create(loc, cond, [&](OpBuilder &b, Location loc) { - generateParallelLoopNest( - b, loc, lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), - steps.drop_front(numProcessed), - iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage, - distributionMethod.drop_front(numProcessed)); + generateParallelLoopNest(b, loc, lbs.drop_front(numProcessed), + ubs.drop_front(numProcessed), + steps.drop_front(numProcessed), + iteratorTypes.drop_front(numProcessed), + remainderProcInfo, bodyBuilderFn, ivStorage); b.create(loc, ValueRange{}); }); return; @@ -730,7 +713,7 @@ generateParallelLoopNest( b, loc, lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed), - bodyBuilderFn, ivStorage, distributionMethod.drop_front(numProcessed)); + remainderProcInfo, bodyBuilderFn, ivStorage); return; } } @@ -743,13 +726,14 @@ function_ref bodyBuilderFn, - Optional distributionOptions, - ArrayRef distributionTypes) { + ArrayRef procInfo) { SmallVector iterArgInitValues = linalgOp.getOutputTensorOperands(); assert(iterArgInitValues.empty() && "unexpected ParallelOp init values"); // This function may be passed more iterator types than ranges. assert(iteratorTypes.size() >= loopRanges.size() && "expected iterator type for all ranges"); + assert((procInfo.empty() || (procInfo.size() == loopRanges.size())) && + "expected proc information for all loops when present"); iteratorTypes = iteratorTypes.take_front(loopRanges.size()); SmallVector lbsStorage, ubsStorage, stepsStorage, ivs; unsigned numLoops = iteratorTypes.size(); @@ -762,42 +746,22 @@ unpackRanges(b, loc, loopRanges, lbsStorage, ubsStorage, stepsStorage); // Modify the lb, ub, and step based on the distribution options. 
- SmallVector distributionMethod; - if (distributionOptions) { - auto &options = *distributionOptions; - distributionMethod.assign(distributionOptions->distributionMethod.begin(), - distributionOptions->distributionMethod.end()); - SmallVector parallelLoopRanges; - for (const auto &iteratorType : enumerate(iteratorTypes)) { - if (isParallelIterator(iteratorType.value())) - parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); - } - if (distributionMethod.size() < parallelLoopRanges.size()) - parallelLoopRanges.resize(distributionMethod.size()); - SmallVector procInfo = - options.procInfo(b, loc, parallelLoopRanges); - unsigned index = 0; - for (const auto &iteratorType : enumerate(iteratorTypes)) { - if (index >= procInfo.size()) - break; - if (isParallelIterator(iteratorType.value())) { - unsigned i = iteratorType.index(); - updateBoundsForCyclicDistribution(b, loc, procInfo[index].procId, - procInfo[index].nprocs, lbsStorage[i], - ubsStorage[i], stepsStorage[i]); - index++; - } + for (auto it : llvm::enumerate(procInfo)) { + if (it.value().distributionMethod != linalg::DistributionMethod::None) { + updateBoundsForCyclicDistribution( + b, loc, it.value().procId, it.value().nprocs, lbsStorage[it.index()], + ubsStorage[it.index()], stepsStorage[it.index()]); } } ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage); generateParallelLoopNest( - b, loc, lbs, ubs, steps, iteratorTypes, + b, loc, lbs, ubs, steps, iteratorTypes, procInfo, [&](OpBuilder &b, Location loc, ValueRange ivs) { SmallVector operandValuesToUse = linalgOp.getInputAndOutputOperands(); bodyBuilderFn(b, loc, ivs, operandValuesToUse); }, - ivs, distributionMethod); + ivs); assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops"); } diff --git a/mlir/lib/Dialect/Math/IR/MathOps.cpp b/mlir/lib/Dialect/Math/IR/MathOps.cpp --- a/mlir/lib/Dialect/Math/IR/MathOps.cpp +++ b/mlir/lib/Dialect/Math/IR/MathOps.cpp @@ -134,6 +134,56 @@ }); } +//===----------------------------------------------------------------------===// +// IPowIOp folder +//===----------------------------------------------------------------------===// + +OpFoldResult math::IPowIOp::fold(ArrayRef operands) { + return constFoldBinaryOpConditional( + operands, [](const APInt &base, const APInt &power) -> Optional { + unsigned width = base.getBitWidth(); + auto zeroValue = APInt::getZero(width); + APInt oneValue{width, 1ULL, /*isSigned=*/true}; + APInt minusOneValue{width, -1ULL, /*isSigned=*/true}; + + if (power.isZero()) + return oneValue; + + if (power.isNegative()) { + // Leave 0 raised to negative power not folded. + if (base.isZero()) + return {}; + if (base.eq(oneValue)) + return oneValue; + // If abs(base) > 1, then the result is zero. + if (base.ne(minusOneValue)) + return zeroValue; + // base == -1: + // -1: power is odd + // 1: power is even + if (power[0] == 1) + return minusOneValue; + + return oneValue; + } + + // power is positive. 
+ APInt result = oneValue; + APInt curBase = base; + APInt curPower = power; + while (true) { + if (curPower[0] == 1) + result *= curBase; + curPower.lshrInPlace(1); + if (curPower.isZero()) + return result; + curBase *= curBase; + } + }); +} + //===----------------------------------------------------------------------===// // LogOp folder //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Math/Transforms/AlgebraicSimplification.cpp b/mlir/lib/Dialect/Math/Transforms/AlgebraicSimplification.cpp --- a/mlir/lib/Dialect/Math/Transforms/AlgebraicSimplification.cpp +++ b/mlir/lib/Dialect/Math/Transforms/AlgebraicSimplification.cpp @@ -112,9 +112,100 @@ return failure(); } +//----------------------------------------------------------------------------// +// IPowIOp strength reduction. +//----------------------------------------------------------------------------// + +namespace { +struct IPowIStrengthReduction : public OpRewritePattern { + unsigned exponentThreshold; + +public: + IPowIStrengthReduction(MLIRContext *context, unsigned exponentThreshold = 3, + PatternBenefit benefit = 1, + ArrayRef generatedNames = {}) + : OpRewritePattern(context, benefit, generatedNames), + exponentThreshold(exponentThreshold) {} + LogicalResult matchAndRewrite(math::IPowIOp op, + PatternRewriter &rewriter) const final; +}; +} // namespace + +LogicalResult +IPowIStrengthReduction::matchAndRewrite(math::IPowIOp op, + PatternRewriter &rewriter) const { + Location loc = op.getLoc(); + Value base = op.getLhs(); + + IntegerAttr scalarExponent; + DenseIntElementsAttr vectorExponent; + + bool isScalar = matchPattern(op.getRhs(), m_Constant(&scalarExponent)); + bool isVector = matchPattern(op.getRhs(), m_Constant(&vectorExponent)); + + // Simplify cases with known exponent value. + int64_t exponentValue = 0; + if (isScalar) + exponentValue = scalarExponent.getInt(); + else if (isVector && vectorExponent.isSplat()) + exponentValue = vectorExponent.getSplatValue().getInt(); + else + return failure(); + + // Broadcasts a scalar value into a vector type compatible with `op` if needed. + auto bcast = [&](Value value) -> Value { + if (auto vec = op.getType().dyn_cast()) + return rewriter.create(loc, vec, value); + return value; + }; + + if (exponentValue == 0) { + // Replace `ipowi(x, 0)` with `1`. + Value one = rewriter.create( + loc, rewriter.getIntegerAttr(getElementTypeOrSelf(op.getType()), 1)); + rewriter.replaceOp(op, bcast(one)); + return success(); + } + + bool exponentIsNegative = false; + if (exponentValue < 0) { + exponentIsNegative = true; + exponentValue *= -1; + } + + // Bail out if `abs(exponent)` exceeds the threshold. + if (exponentValue > exponentThreshold) + return failure(); + + // Invert the base for a negative exponent, i.e. for + // `ipowi(x, negative_exponent)` set `x` to `1 / x`. + if (exponentIsNegative) { + Value one = rewriter.create( + loc, rewriter.getIntegerAttr(getElementTypeOrSelf(op.getType()), 1)); + base = rewriter.create(loc, bcast(one), base); + } + + Value result = base; + // Transform to naive sequence of multiplications: + // * For positive exponent case replace: + // `ipowi(x, positive_exponent)` + // with: + // x * x * x * ... + // * For negative exponent case replace: + // `ipowi(x, negative_exponent)` + // with: + // (1 / x) * (1 / x) * (1 / x) * ... 
+ for (unsigned i = 1; i < exponentValue; ++i) + result = rewriter.create(loc, result, base); + + rewriter.replaceOp(op, result); + return success(); +} + //----------------------------------------------------------------------------// void mlir::populateMathAlgebraicSimplificationPatterns( RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); + patterns.add( + patterns.getContext()); } diff --git a/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp b/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp --- a/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp +++ b/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp @@ -42,7 +42,8 @@ PatternRewriter &rewrite) const override { Location location = op->getLoc(); - if (op->hasAttr(op.getTf32EnabledAttrName())) + if (op->hasAttr(op.getTf32EnabledAttrName()) || + !op.getMatrixA().getType().cast().getElementType().isF32()) return failure(); if (precision == MmaSyncF32Lowering::Unkown) diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp @@ -348,6 +348,39 @@ return CooperativeMatrixNVType::get(elementTy, scope, dims[0], dims[1]); } +// joint-matrix-type ::= `!spv.jointmatrix` `<`rows `x` columns `x` element-type +// `,` layout `,` scope`>` +static Type parseJointMatrixType(SPIRVDialect const &dialect, + DialectAsmParser &parser) { + if (parser.parseLess()) + return Type(); + + SmallVector dims; + SMLoc countLoc = parser.getCurrentLocation(); + if (parser.parseDimensionList(dims, /*allowDynamic=*/false)) + return Type(); + + if (dims.size() != 2) { + parser.emitError(countLoc, "expected rows and columns size"); + return Type(); + } + + auto elementTy = parseAndVerifyType(dialect, parser); + if (!elementTy) + return Type(); + MatrixLayout matrixLayout; + if (parser.parseComma() || + parseEnumKeywordAttr(matrixLayout, parser, "matrixLayout ")) + return Type(); + Scope scope; + if (parser.parseComma() || parseEnumKeywordAttr(scope, parser, "scope ")) + return Type(); + if (parser.parseGreater()) + return Type(); + return JointMatrixINTELType::get(elementTy, scope, dims[0], dims[1], + matrixLayout); +} + // TODO: Reorder methods to be utilities first and parse*Type // methods in alphabetical order // @@ -753,6 +786,8 @@ return parseArrayType(*this, parser); if (keyword == "coopmatrix") return parseCooperativeMatrixType(*this, parser); + if (keyword == "jointmatrix") + return parseJointMatrixType(*this, parser); if (keyword == "image") return parseImageType(*this, parser); if (keyword == "ptr") @@ -859,6 +894,13 @@ os << ">"; } +static void print(JointMatrixINTELType type, DialectAsmPrinter &os) { + os << "jointmatrix<" << type.getRows() << "x" << type.getColumns() << "x"; + os << type.getElementType() << ", " + << stringifyMatrixLayout(type.getMatrixLayout()); + os << ", " << stringifyScope(type.getScope()) << ">"; +} + static void print(MatrixType type, DialectAsmPrinter &os) { os << "matrix<" << type.getNumColumns() << " x " << type.getColumnType(); os << ">"; @@ -866,9 +908,9 @@ void SPIRVDialect::printType(Type type, DialectAsmPrinter &os) const { TypeSwitch(type) - .Case( - [&](auto type) { print(type, os); }) + .Case([&](auto type) { print(type, os); }) .Default([](Type) { llvm_unreachable("unhandled SPIR-V type"); }); } diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp 
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp @@ -436,6 +436,13 @@ resultType.cast().getElementType(); } + if (auto jointMatrixType = + operandType.dyn_cast()) { + operandType = jointMatrixType.getElementType(); + resultType = + resultType.cast().getElementType(); + } + auto operandTypeBitWidth = operandType.getIntOrFloatBitWidth(); auto resultTypeBitWidth = resultType.getIntOrFloatBitWidth(); auto isSameBitWidth = operandTypeBitWidth == resultTypeBitWidth; @@ -1637,6 +1644,17 @@ return success(); } + if (auto jointType = cType.dyn_cast()) { + if (constituents.size() != 1) + return emitOpError("has incorrect number of operands: expected ") + << "1, but provided " << constituents.size(); + if (jointType.getElementType() != constituents.front().getType()) + return emitOpError("operand type mismatch: expected operand type ") + << jointType.getElementType() << ", but provided " + << constituents.front().getType(); + return success(); + } + if (constituents.size() == cType.getNumElements()) { for (auto index : llvm::seq(0, constituents.size())) { if (constituents[index].getType() != cType.getElementType(index)) { @@ -3893,6 +3911,70 @@ return verifyCoopMatrixMulAdd(*this); } +static LogicalResult +verifyPointerAndJointMatrixType(Operation *op, Type pointer, Type jointMatrix) { + Type pointeeType = pointer.cast().getPointeeType(); + if (!pointeeType.isa() && !pointeeType.isa()) + return op->emitError( + "Pointer must point to a scalar or vector type but provided ") + << pointeeType; + spirv::StorageClass storage = + pointer.cast().getStorageClass(); + if (storage != spirv::StorageClass::Workgroup && + storage != spirv::StorageClass::CrossWorkgroup) + return op->emitError("Pointer storage class must be Workgroup or " + "CrossWorkgroup but provided ") + << stringifyStorageClass(storage); + return success(); +} + +//===----------------------------------------------------------------------===// +// spv.JointMatrixLoadINTEL +//===----------------------------------------------------------------------===// + +LogicalResult spirv::JointMatrixLoadINTELOp::verify() { + return verifyPointerAndJointMatrixType(*this, pointer().getType(), + result().getType()); +} + +//===----------------------------------------------------------------------===// +// spv.JointMatrixStoreINTEL +//===----------------------------------------------------------------------===// + +LogicalResult spirv::JointMatrixStoreINTELOp::verify() { + return verifyPointerAndJointMatrixType(*this, pointer().getType(), + object().getType()); +} + +//===----------------------------------------------------------------------===// +// spv.JointMatrixMadINTEL +//===----------------------------------------------------------------------===// + +static LogicalResult verifyJointMatrixMad(spirv::JointMatrixMadINTELOp op) { + if (op.c().getType() != op.result().getType()) + return op.emitOpError("result and third operand must have the same type"); + auto typeA = op.a().getType().cast(); + auto typeB = op.b().getType().cast(); + auto typeC = op.c().getType().cast(); + auto typeR = op.result().getType().cast(); + if (typeA.getRows() != typeR.getRows() || + typeA.getColumns() != typeB.getRows() || + typeB.getColumns() != typeR.getColumns()) + return op.emitOpError("matrix size must match"); + if (typeR.getScope() != typeA.getScope() || + typeR.getScope() != typeB.getScope() || + typeR.getScope() != typeC.getScope()) + return op.emitOpError("matrix scope must match"); + if (typeA.getElementType() != typeB.getElementType() || + typeR.getElementType() != 
typeC.getElementType()) + return op.emitOpError("matrix element type must match"); + return success(); +} + +LogicalResult spirv::JointMatrixMadINTELOp::verify() { + return verifyJointMatrixMad(*this); +} + //===----------------------------------------------------------------------===// // spv.MatrixTimesScalar //===----------------------------------------------------------------------===// @@ -4150,6 +4232,8 @@ if (cType.isa()) return emitError("unsupported composite type ") << cType; + if (cType.isa()) + return emitError("unsupported composite type ") << cType; if (constituents.size() != cType.getNumElements()) return emitError("has incorrect number of operands: expected ") << cType.getNumElements() << ", but provided " diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp @@ -89,9 +89,9 @@ bool CompositeType::classof(Type type) { if (auto vectorType = type.dyn_cast()) return isValid(vectorType); - return type - .isa(); + return type.isa(); } bool CompositeType::isValid(VectorType type) { @@ -110,7 +110,8 @@ Type CompositeType::getElementType(unsigned index) const { return TypeSwitch(*this) - .Case( + .Case( [](auto type) { return type.getElementType(); }) .Case([](MatrixType type) { return type.getColumnType(); }) .Case( @@ -132,6 +133,10 @@ llvm_unreachable( "invalid to query number of elements of spirv::CooperativeMatrix type"); } + if (isa()) { + llvm_unreachable( + "invalid to query number of elements of spirv::JointMatrix type"); + } if (isa()) { llvm_unreachable( "invalid to query number of elements of spirv::RuntimeArray type"); @@ -140,15 +145,16 @@ } bool CompositeType::hasCompileTimeKnownNumElements() const { - return !isa(); + return !isa(); } void CompositeType::getExtensions( SPIRVType::ExtensionArrayRefVector &extensions, Optional storage) { TypeSwitch(*this) - .Case( + .Case( [&](auto type) { type.getExtensions(extensions, storage); }) .Case([&](VectorType type) { return type.getElementType().cast().getExtensions( @@ -161,8 +167,8 @@ SPIRVType::CapabilityArrayRefVector &capabilities, Optional storage) { TypeSwitch(*this) - .Case( + .Case( [&](auto type) { type.getCapabilities(capabilities, storage); }) .Case([&](VectorType type) { auto vecSize = getNumElements(); @@ -255,6 +261,74 @@ capabilities.push_back(ref); } +//===----------------------------------------------------------------------===// +// JointMatrixType +//===----------------------------------------------------------------------===// + +struct spirv::detail::JointMatrixTypeStorage : public TypeStorage { + using KeyTy = std::tuple; + + static JointMatrixTypeStorage *construct(TypeStorageAllocator &allocator, + const KeyTy &key) { + return new (allocator.allocate()) + JointMatrixTypeStorage(key); + } + + bool operator==(const KeyTy &key) const { + return key == KeyTy(elementType, rows, columns, matrixLayout, scope); + } + + JointMatrixTypeStorage(const KeyTy &key) + : elementType(std::get<0>(key)), rows(std::get<1>(key)), + columns(std::get<2>(key)), scope(std::get<4>(key)), + matrixLayout(std::get<3>(key)) {} + + Type elementType; + unsigned rows; + unsigned columns; + Scope scope; + MatrixLayout matrixLayout; +}; + +JointMatrixINTELType JointMatrixINTELType::get(Type elementType, Scope scope, + unsigned rows, unsigned columns, + MatrixLayout matrixLayout) { + return Base::get(elementType.getContext(), elementType, rows, columns, + matrixLayout, scope); +} + +Type 
JointMatrixINTELType::getElementType() const { + return getImpl()->elementType; +} + +Scope JointMatrixINTELType::getScope() const { return getImpl()->scope; } + +unsigned JointMatrixINTELType::getRows() const { return getImpl()->rows; } + +unsigned JointMatrixINTELType::getColumns() const { return getImpl()->columns; } + +MatrixLayout JointMatrixINTELType::getMatrixLayout() const { + return getImpl()->matrixLayout; +} + +void JointMatrixINTELType::getExtensions( + SPIRVType::ExtensionArrayRefVector &extensions, + Optional storage) { + getElementType().cast().getExtensions(extensions, storage); + static const Extension exts[] = {Extension::SPV_INTEL_joint_matrix}; + ArrayRef ref(exts, llvm::array_lengthof(exts)); + extensions.push_back(ref); +} + +void JointMatrixINTELType::getCapabilities( + SPIRVType::CapabilityArrayRefVector &capabilities, + Optional storage) { + getElementType().cast().getCapabilities(capabilities, storage); + static const Capability caps[] = {Capability::JointMatrixINTEL}; + ArrayRef ref(caps, llvm::array_lengthof(caps)); + capabilities.push_back(ref); +} + //===----------------------------------------------------------------------===// // ImageType //===----------------------------------------------------------------------===// @@ -1172,6 +1246,7 @@ //===----------------------------------------------------------------------===// void SPIRVDialect::registerTypes() { - addTypes(); + addTypes(); } diff --git a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp --- a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp +++ b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp @@ -32,8 +32,8 @@ Value mlir::tosa::clampFloatHelper(Location loc, Value arg, arith::ConstantOp min, arith::ConstantOp max, OpBuilder &rewriter) { - Value minValue = rewriter.create(loc, arg, min); - return rewriter.create(loc, minValue, max); + Value minValue = rewriter.create(loc, arg, max); + return rewriter.create(loc, minValue, min); } Value mlir::tosa::clampIntHelper(Location loc, Value arg, arith::ConstantOp min, diff --git a/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp b/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp --- a/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp @@ -168,6 +168,8 @@ return processType(opcode, operands); case spirv::Opcode::OpTypeForwardPointer: return processTypeForwardPointer(operands); + case spirv::Opcode::OpTypeJointMatrixINTEL: + return processType(opcode, operands); case spirv::Opcode::OpConstant: return processConstant(operands, /*isSpec=*/false); case spirv::Opcode::OpSpecConstant: diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h @@ -257,6 +257,8 @@ LogicalResult processFunctionType(ArrayRef operands); + LogicalResult processJointMatrixType(ArrayRef operands); + LogicalResult processImageType(ArrayRef operands); LogicalResult processSampledImageType(ArrayRef operands); diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -730,6 +730,8 @@ return processCooperativeMatrixType(operands); case spirv::Opcode::OpTypeFunction: return 
processFunctionType(operands); + case spirv::Opcode::OpTypeJointMatrixINTEL: + return processJointMatrixType(operands); case spirv::Opcode::OpTypeImage: return processImageType(operands); case spirv::Opcode::OpTypeSampledImage: @@ -888,6 +890,40 @@ return success(); } +LogicalResult +spirv::Deserializer::processJointMatrixType(ArrayRef operands) { + if (operands.size() != 6) { + return emitError(unknownLoc, "OpTypeJointMatrix must have an element " + "type and row x column parameters"); + } + + Type elementTy = getType(operands[1]); + if (!elementTy) { + return emitError(unknownLoc, "OpTypeJointMatrix references undefined ") + << operands[1]; + } + + auto scope = spirv::symbolizeScope(getConstantInt(operands[5]).getInt()); + if (!scope) { + return emitError(unknownLoc, + "OpTypeJointMatrix references undefined scope ") + << operands[5]; + } + auto matrixLayout = + spirv::symbolizeMatrixLayout(getConstantInt(operands[4]).getInt()); + if (!matrixLayout) { + return emitError(unknownLoc, + "OpTypeJointMatrix references undefined matrix layout ") + << operands[4]; + } + unsigned rows = getConstantInt(operands[2]).getInt(); + unsigned columns = getConstantInt(operands[3]).getInt(); + + typeMap[operands[0]] = spirv::JointMatrixINTELType::get( + elementTy, scope.value(), rows, columns, matrixLayout.value()); + return success(); +} + LogicalResult spirv::Deserializer::processRuntimeArrayType(ArrayRef operands) { if (operands.size() != 2) { diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp --- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp +++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp @@ -598,6 +598,27 @@ return success(); } + if (auto jointMatrixType = type.dyn_cast()) { + uint32_t elementTypeID = 0; + if (failed(processTypeImpl(loc, jointMatrixType.getElementType(), + elementTypeID, serializationCtx))) { + return failure(); + } + typeEnum = spirv::Opcode::OpTypeJointMatrixINTEL; + auto getConstantOp = [&](uint32_t id) { + auto attr = IntegerAttr::get(IntegerType::get(type.getContext(), 32), id); + return prepareConstantInt(loc, attr); + }; + operands.push_back(elementTypeID); + operands.push_back(getConstantOp(jointMatrixType.getRows())); + operands.push_back(getConstantOp(jointMatrixType.getColumns())); + operands.push_back(getConstantOp( + static_cast(jointMatrixType.getMatrixLayout()))); + operands.push_back( + getConstantOp(static_cast(jointMatrixType.getScope()))); + return success(); + } + if (auto matrixType = type.dyn_cast()) { uint32_t elementTypeID = 0; if (failed(processTypeImpl(loc, matrixType.getColumnType(), elementTypeID, diff --git a/mlir/lib/Tools/lsp-server-support/Protocol.cpp b/mlir/lib/Tools/lsp-server-support/Protocol.cpp --- a/mlir/lib/Tools/lsp-server-support/Protocol.cpp +++ b/mlir/lib/Tools/lsp-server-support/Protocol.cpp @@ -121,7 +121,7 @@ return false; if (!llvm::isAlpha(scheme[0])) return false; - return std::all_of(scheme.begin() + 1, scheme.end(), [](char c) { + return llvm::all_of(llvm::drop_begin(scheme), [](char c) { return llvm::isAlnum(c) || c == '+' || c == '.' 
|| c == '-'; }); } diff --git a/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp b/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp --- a/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp +++ b/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp @@ -8,29 +8,19 @@ #include "mlir/Transforms/TopologicalSortUtils.h" #include "mlir/IR/OpDefinition.h" +#include "llvm/ADT/SetVector.h" using namespace mlir; -bool mlir::sortTopologically( - Block *block, llvm::iterator_range ops, - function_ref isOperandReady) { - if (ops.empty()) - return true; - - // The set of operations that have not yet been scheduled. - DenseSet unscheduledOps; - // Mark all operations as unscheduled. - for (Operation &op : ops) - unscheduledOps.insert(&op); - - Block::iterator nextScheduledOp = ops.begin(); - Block::iterator end = ops.end(); - +/// Return `true` if the given operation is ready to be scheduled. +static bool isOpReady(Block *block, Operation *op, + DenseSet &unscheduledOps, + function_ref isOperandReady) { // An operation is ready to be scheduled if all its operands are ready. An // operation is ready if: const auto isReady = [&](Value value, Operation *top) { // - the user-provided callback marks it as ready, - if (isOperandReady && isOperandReady(value, top)) + if (isOperandReady && isOperandReady(value, op)) return true; Operation *parent = value.getDefiningOp(); // - it is a block argument, if (!parent) return true; @@ -41,12 +31,38 @@ if (!ancestor) return true; // - it is defined in a nested region, or - if (ancestor == top) + if (ancestor == op) return true; // - its ancestor in the block is scheduled. return !unscheduledOps.contains(ancestor); }; + // An operation is recursively ready to be scheduled if it and its nested + // operations are ready. + WalkResult readyToSchedule = op->walk([&](Operation *nestedOp) { + return llvm::all_of(nestedOp->getOperands(), + [&](Value operand) { return isReady(operand, op); }) + ? WalkResult::advance() + : WalkResult::interrupt(); + }); + return !readyToSchedule.wasInterrupted(); +} + +bool mlir::sortTopologically( + Block *block, llvm::iterator_range ops, + function_ref isOperandReady) { + if (ops.empty()) + return true; + + // The set of operations that have not yet been scheduled. + DenseSet unscheduledOps; + // Mark all operations as unscheduled. + for (Operation &op : ops) + unscheduledOps.insert(&op); + + Block::iterator nextScheduledOp = ops.begin(); + Block::iterator end = ops.end(); + bool allOpsScheduled = true; while (!unscheduledOps.empty()) { bool scheduledAtLeastOnce = false; @@ -56,16 +72,7 @@ // set, and "schedule" it (move it before the `nextScheduledOp`). for (Operation &op : llvm::make_early_inc_range(llvm::make_range(nextScheduledOp, end))) { - // An operation is recursively ready to be scheduled of it and its nested - // operations are ready. - WalkResult readyToSchedule = op.walk([&](Operation *nestedOp) { - return llvm::all_of( - nestedOp->getOperands(), - [&](Value operand) { return isReady(operand, &op); }) - ? WalkResult::advance() - : WalkResult::interrupt(); - }); - if (readyToSchedule.wasInterrupted()) + if (!isOpReady(block, &op, unscheduledOps, isOperandReady)) continue; // Schedule the operation by moving it to the start. @@ -96,3 +103,48 @@ isOperandReady); return sortTopologically(block, *block, isOperandReady); } + +bool mlir::computeTopologicalSorting( + Block *block, MutableArrayRef ops, + function_ref isOperandReady) { + if (ops.empty()) + return true; + + // The set of operations that have not yet been scheduled. 
diff --git a/mlir/python/mlir/dialects/_structured_transform_ops_ext.py b/mlir/python/mlir/dialects/_structured_transform_ops_ext.py --- a/mlir/python/mlir/dialects/_structured_transform_ops_ext.py +++ b/mlir/python/mlir/dialects/_structured_transform_ops_ext.py @@ -110,6 +110,24 @@ ip=ip) + +class MatchOp: + """Specialization for MatchOp class.""" + + @classmethod + def match_op_names(cls, + target: Union[Operation, Value], + names: Sequence[str], + loc=None, + ip=None): + pdl_operation_type = pdl.OperationType.get() + return cls( + pdl_operation_type, + _get_op_result_or_value(target), + ops=ArrayAttr.get([StringAttr.get(s) for s in names]), + loc=loc, + ip=ip) + + class MultiTileSizesOp: """Specialization for MultitileSizesOp class.""" diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -467,8 +467,8 @@ // CHECK: ^bb0(%[[ARG1:.+]]: f16, // CHECK-DAG: %[[C0:.+]] = arith.constant 0.0 // CHECK-DAG: %[[C6:.+]] = arith.constant 6.0 - // CHECK-DAG: %[[MIN:.+]] = arith.minf %[[ARG1]], %[[C0]] - // CHECK-DAG: %[[MAX:.+]] = arith.maxf %[[MIN]], %[[C6]] + // CHECK-DAG: %[[MIN:.+]] = arith.minf %[[ARG1]], %[[C6]] + // CHECK-DAG: %[[MAX:.+]] = arith.maxf %[[MIN]], %[[C0]] %0 = "tosa.clamp"(%arg0) {min_int = 0 : i64, max_int = 0 : i64, min_fp = 0.0 : f32, max_fp = 6.0 : f32} : (tensor<1xf16>) -> tensor<1xf16> return diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir @@ -94,7 +94,7 @@ // CHECK: func @insertion_point_outside_loop( // CHECK-SAME: %[[t:.*]]: memref<?xf32>, %[[sz:.*]]: index, %[[idx:.*]]: index) func.func @insertion_point_outside_loop(%t : tensor<?xf32>, %sz : index, - %idx : index) -> (tensor<?xf32>) { + %idx : index) -> (tensor<?xf32>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c5 = arith.constant 5 : index @@ -118,3 +118,21 @@ return %r : tensor<?xf32> } + +// ----- + +// AllocTensorElimination currently does not apply to chains where the type is changing. This test just ensures that we do not crash or generate IR that does not verify. + +// CHECK-LABEL: func @shape_mismatch +func.func @shape_mismatch(%t: tensor<5x6x128xf32>) -> tensor<5x6x128xf32> { + %cst = arith.constant 8.0 : f32 + %0 = bufferization.alloc_tensor() : tensor<128xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<128xf32>) -> tensor<128xf32> + %2 = tensor.expand_shape %1 [[0, 1, 2]] + : tensor<128xf32> into tensor<1x1x128xf32> + %3 = tensor.insert_slice %2 into %t[2, 3, 0][1, 1, 128][1, 1, 1] + : tensor<1x1x128xf32> into tensor<5x6x128xf32> + return %3 : tensor<5x6x128xf32> +}
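The TosaToLinalg hunk a little above flips its CHECK lines because the correct lowering of tosa.clamp clips against the upper bound first (min) and the lower bound second (max); the old expectations had the two constants swapped. In scalar form, a small reference sketch of the property the updated test encodes (names chosen here for illustration):

#include <algorithm>

// clamp(x, lo, hi) == max(min(x, hi), lo). For the f16 test above this
// is arith.minf against 6.0 (max_fp) followed by arith.maxf against
// 0.0 (min_fp).
static float clampRef(float x, float lo, float hi) {
  float clipped = std::min(x, hi); // matches: minf %arg1, %c6
  return std::max(clipped, lo);    // matches: maxf %min, %c0
}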
diff --git a/mlir/test/Dialect/Math/algebraic-simplification.mlir b/mlir/test/Dialect/Math/algebraic-simplification.mlir --- a/mlir/test/Dialect/Math/algebraic-simplification.mlir +++ b/mlir/test/Dialect/Math/algebraic-simplification.mlir @@ -73,3 +73,93 @@ %1 = math.powf %arg1, %v : vector<4xf32> return %0, %1 : f32, vector<4xf32> } + +// CHECK-LABEL: @ipowi_zero_exp( +// CHECK-SAME: %[[ARG0:.+]]: i32 +// CHECK-SAME: %[[ARG1:.+]]: vector<4xi32> +// CHECK-SAME: -> (i32, vector<4xi32>) { +func.func @ipowi_zero_exp(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32>) { + // CHECK: %[[CST_S:.*]] = arith.constant 1 : i32 + // CHECK: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32> + // CHECK: return %[[CST_S]], %[[CST_V]] + %c = arith.constant 0 : i32 + %v = arith.constant dense <0> : vector<4xi32> + %0 = math.ipowi %arg0, %c : i32 + %1 = math.ipowi %arg1, %v : vector<4xi32> + return %0, %1 : i32, vector<4xi32> +} + +// CHECK-LABEL: @ipowi_exp_one( +// CHECK-SAME: %[[ARG0:.+]]: i32 +// CHECK-SAME: %[[ARG1:.+]]: vector<4xi32> +// CHECK-SAME: -> (i32, vector<4xi32>, i32, vector<4xi32>) { +func.func @ipowi_exp_one(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32>, i32, vector<4xi32>) { + // CHECK: %[[CST_S:.*]] = arith.constant 1 : i32 + // CHECK: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32> + // CHECK: %[[SCALAR:.*]] = arith.divsi %[[CST_S]], %[[ARG0]] + // CHECK: %[[VECTOR:.*]] = arith.divsi %[[CST_V]], %[[ARG1]] + // CHECK: return %[[ARG0]], %[[ARG1]], %[[SCALAR]], %[[VECTOR]] + %c1 = arith.constant 1 : i32 + %v1 = arith.constant dense <1> : vector<4xi32> + %0 = math.ipowi %arg0, %c1 : i32 + %1 = math.ipowi %arg1, %v1 : vector<4xi32> + %cm1 = arith.constant -1 : i32 + %vm1 = arith.constant dense <-1> : vector<4xi32> + %2 = math.ipowi %arg0, %cm1 : i32 + %3 = math.ipowi %arg1, %vm1 : vector<4xi32> + return %0, %1, %2, %3 : i32, vector<4xi32>, i32, vector<4xi32> +} + +// CHECK-LABEL: @ipowi_exp_two( +// CHECK-SAME: %[[ARG0:.+]]: i32 +// CHECK-SAME: %[[ARG1:.+]]: vector<4xi32> +// CHECK-SAME: -> (i32, vector<4xi32>, i32, vector<4xi32>) { +func.func @ipowi_exp_two(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32>, i32, vector<4xi32>) { + // CHECK: %[[CST_S:.*]] = arith.constant 1 : i32 + // CHECK: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32> + // CHECK: %[[SCALAR0:.*]] = arith.muli %[[ARG0]], %[[ARG0]] + // CHECK: %[[VECTOR0:.*]] = arith.muli %[[ARG1]], %[[ARG1]] + // CHECK: %[[SCALAR1:.*]] = arith.divsi %[[CST_S]], %[[ARG0]] + // CHECK: %[[SMUL:.*]] = arith.muli %[[SCALAR1]], %[[SCALAR1]] + // CHECK: %[[VECTOR1:.*]] = arith.divsi %[[CST_V]], %[[ARG1]] + // CHECK: %[[VMUL:.*]] = arith.muli %[[VECTOR1]], %[[VECTOR1]] + // CHECK: return %[[SCALAR0]], %[[VECTOR0]], %[[SMUL]], %[[VMUL]] + %c1 = arith.constant 2 : i32 + %v1 = arith.constant dense <2> : vector<4xi32> + %0 = math.ipowi %arg0, %c1 : i32 + %1 = math.ipowi %arg1, %v1 : vector<4xi32> + %cm1 = arith.constant -2 : i32 + %vm1 = arith.constant dense <-2> : vector<4xi32> + %2 = math.ipowi %arg0, %cm1 : i32 + %3 = math.ipowi %arg1, %vm1 : vector<4xi32> + return %0, %1, %2, %3 : i32, vector<4xi32>, i32, vector<4xi32> +} + +// CHECK-LABEL: @ipowi_exp_three( +// CHECK-SAME: %[[ARG0:.+]]: i32 +// CHECK-SAME: %[[ARG1:.+]]: vector<4xi32> +// CHECK-SAME: -> (i32, vector<4xi32>, i32, vector<4xi32>) { +func.func @ipowi_exp_three(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32>, i32, vector<4xi32>) { + // CHECK: %[[CST_S:.*]] = arith.constant 1 : i32 + // CHECK: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32> + // CHECK: %[[SMUL0:.*]] = arith.muli %[[ARG0]], %[[ARG0]] + // CHECK: %[[SCALAR0:.*]] = arith.muli %[[SMUL0]], %[[ARG0]] + // CHECK: %[[VMUL0:.*]] = arith.muli %[[ARG1]], %[[ARG1]] + // CHECK: %[[VECTOR0:.*]] = arith.muli %[[VMUL0]], %[[ARG1]] + // CHECK: %[[SCALAR1:.*]] = arith.divsi %[[CST_S]], %[[ARG0]] + // CHECK: %[[SMUL1:.*]] = arith.muli %[[SCALAR1]], %[[SCALAR1]] + // CHECK: %[[SMUL2:.*]] = arith.muli %[[SMUL1]], %[[SCALAR1]] + // CHECK: %[[VECTOR1:.*]] = arith.divsi %[[CST_V]], %[[ARG1]] + // CHECK: %[[VMUL1:.*]] = arith.muli %[[VECTOR1]], %[[VECTOR1]] + // CHECK: %[[VMUL2:.*]] = arith.muli %[[VMUL1]], %[[VECTOR1]] + // CHECK: return %[[SCALAR0]], %[[VECTOR0]], %[[SMUL2]], %[[VMUL2]] + %c1 = arith.constant 3 : i32 + %v1 = arith.constant dense <3> : vector<4xi32> + %0 = math.ipowi %arg0, %c1 : i32 + %1 = math.ipowi %arg1, %v1 : vector<4xi32> + %cm1 = arith.constant -3 : i32 + %vm1 = arith.constant dense <-3> : vector<4xi32> + %2 = math.ipowi %arg0, %cm1 : i32 + %3 = math.ipowi %arg1, %vm1 : vector<4xi32> + return %0, %1, %2, %3 : i32, vector<4xi32>, i32, vector<4xi32> +} diff --git a/mlir/test/Dialect/Math/canonicalize_ipowi.mlir b/mlir/test/Dialect/Math/canonicalize_ipowi.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Math/canonicalize_ipowi.mlir @@ -0,0 +1,442 @@ +// RUN: mlir-opt %s -canonicalize | FileCheck %s + +// CHECK-LABEL: @ipowi32_fold( +// CHECK-SAME: %[[result:.+]]: memref<?xi32> +func.func @ipowi32_fold(%result : memref<?xi32>) { +// CHECK-DAG: %[[cst0:.+]] = arith.constant 0 : i32 +// CHECK-DAG: %[[cst1:.+]] = arith.constant 1 : i32 +// CHECK-DAG: %[[cst1073741824:.+]] = arith.constant 1073741824 : i32 +// CHECK-DAG: %[[cst_m1:.+]] = arith.constant -1 : i32 +// CHECK-DAG: %[[cst_m27:.+]] = arith.constant -27 : i32 +// CHECK-DAG: %[[i0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[i1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[i2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[i3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[i4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[i5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[i6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[i7:.+]] = arith.constant 7 : index +// CHECK-DAG: %[[i8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[i9:.+]] = arith.constant 9 : index +// CHECK-DAG: %[[i10:.+]] = arith.constant 10 : index +// CHECK-DAG: %[[i11:.+]] = arith.constant 11 : index + +// --- Test power == 0 --- + %arg0_base = arith.constant 0 : i32 + %arg0_power = arith.constant 0 : i32 + %res0 = math.ipowi %arg0_base, %arg0_power : i32 + %i0 = arith.constant 0 : index + memref.store %res0, %result[%i0] : memref<?xi32> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i0]]] : memref<?xi32> + + %arg1_base = arith.constant 10 : i32 + %arg1_power = arith.constant 0 : i32 + %res1 = math.ipowi %arg1_base, %arg1_power : i32 + %i1 = arith.constant 1 : index + memref.store %res1, %result[%i1] : memref<?xi32> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i1]]] : memref<?xi32> + + %arg2_base = arith.constant -10 : i32 + %arg2_power = arith.constant 0 : i32 + %res2 = math.ipowi %arg2_base, %arg2_power : i32 + %i2 = arith.constant 2 : index + memref.store %res2, %result[%i2] : memref<?xi32> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i2]]] : memref<?xi32> + +// --- Test negative powers --- + %arg3_base = arith.constant 0 : i32 + %arg3_power = arith.constant -1 : i32 + %res3 = math.ipowi %arg3_base, %arg3_power : i32 + %i3 = arith.constant 3 : index + memref.store %res3, %result[%i3] : memref<?xi32> +// No folding for ipowi(0, x) for x < 0: +// CHECK: %[[res3:.+]] = math.ipowi %[[cst0]], %[[cst_m1]] : i32 +// CHECK: memref.store %[[res3]], %[[result]][%[[i3]]] : memref<?xi32> + + %arg4_base = arith.constant 1 : i32 + %arg4_power = arith.constant -10 : i32 + %res4 = math.ipowi %arg4_base, %arg4_power : i32 + %i4 = arith.constant 4 : index + memref.store %res4, %result[%i4] : memref<?xi32> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i4]]] : memref<?xi32> + + %arg5_base = arith.constant 2 : i32 + %arg5_power = arith.constant -1 : i32 + %res5 = math.ipowi %arg5_base, %arg5_power : i32 + %i5 = arith.constant 5 : index + memref.store %res5, %result[%i5] : memref<?xi32> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i5]]] : memref<?xi32> + + %arg6_base = arith.constant -2 : i32 + %arg6_power = arith.constant -1 : i32 + %res6 = math.ipowi %arg6_base, %arg6_power : i32 + %i6 = arith.constant 6 : index + memref.store %res6, %result[%i6] : memref<?xi32> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i6]]] : memref<?xi32> + + %arg7_base = arith.constant -1 : i32 + %arg7_power = arith.constant -10 : i32 + %res7 = math.ipowi %arg7_base, %arg7_power : i32 + %i7 = arith.constant 7 : index + memref.store %res7, %result[%i7] : memref<?xi32> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i7]]] : memref<?xi32> + + %arg8_base = arith.constant -1 : i32 + %arg8_power = arith.constant -11 : i32 + %res8 = math.ipowi %arg8_base, %arg8_power : i32 + %i8 = arith.constant 8 : index + memref.store %res8, %result[%i8] : memref<?xi32> +// CHECK: memref.store %[[cst_m1]], %[[result]][%[[i8]]] : memref<?xi32> + +// --- Test positive powers --- + %arg9_base = arith.constant -3 : i32 + %arg9_power = arith.constant 3 : i32 + %res9 = math.ipowi %arg9_base, %arg9_power : i32 + %i9 = arith.constant 9 : index + memref.store %res9, %result[%i9] : memref<?xi32> +// CHECK: memref.store %[[cst_m27]], %[[result]][%[[i9]]] : memref<?xi32> + + %arg10_base = arith.constant 2 : i32 + %arg10_power = arith.constant 30 : i32 + %res10 = math.ipowi %arg10_base, %arg10_power : i32 + %i10 = arith.constant 10 : index + memref.store %res10, %result[%i10] : memref<?xi32> +// CHECK: memref.store %[[cst1073741824]], %[[result]][%[[i10]]] : memref<?xi32> + +// --- Test vector folding --- + %arg11_base = arith.constant 2 : i32 + %arg11_base_vec = vector.splat %arg11_base : vector<2x2xi32> + %arg11_power = arith.constant 30 : i32 + %arg11_power_vec = vector.splat %arg11_power : vector<2x2xi32> + %res11_vec = math.ipowi %arg11_base_vec, %arg11_power_vec : vector<2x2xi32> + %i11 = arith.constant 11 : index + %res11 = vector.extract %res11_vec[1, 1] : vector<2x2xi32> + memref.store %res11, %result[%i11] : memref<?xi32> +// CHECK: memref.store %[[cst1073741824]], %[[result]][%[[i11]]] : memref<?xi32> + + return +} + +// CHECK-LABEL: @ipowi64_fold( +// CHECK-SAME: %[[result:.+]]: memref<?xi64> +func.func @ipowi64_fold(%result : memref<?xi64>) { +// CHECK-DAG: %[[cst0:.+]] = arith.constant 0 : i64 +// CHECK-DAG: %[[cst1:.+]] = arith.constant 1 : i64 +// CHECK-DAG: %[[cst1073741824:.+]] = arith.constant 1073741824 : i64 +// CHECK-DAG: %[[cst281474976710656:.+]] = arith.constant 281474976710656 : i64 +// CHECK-DAG: %[[cst_m1:.+]] = arith.constant -1 : i64 +// CHECK-DAG: %[[cst_m27:.+]] = arith.constant -27 : i64 +// CHECK-DAG: %[[i0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[i1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[i2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[i3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[i4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[i5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[i6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[i7:.+]] = arith.constant 7 : index +// CHECK-DAG: %[[i8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[i9:.+]] = arith.constant 9 : index +// CHECK-DAG: %[[i10:.+]] = arith.constant 10 : index +// CHECK-DAG: %[[i11:.+]] = arith.constant 11 : index + +// --- Test power == 0 --- + %arg0_base = arith.constant 0 : i64 + %arg0_power = arith.constant 0 : i64 + %res0 = math.ipowi %arg0_base, %arg0_power : i64 + %i0 = arith.constant 0 : index + memref.store %res0, %result[%i0] : memref<?xi64> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i0]]] : memref<?xi64> + + %arg1_base = arith.constant 10 : i64 + %arg1_power = arith.constant 0 : i64 + %res1 = math.ipowi %arg1_base, %arg1_power : i64 + %i1 = arith.constant 1 : index + memref.store %res1, %result[%i1] : memref<?xi64> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i1]]] : memref<?xi64> + + %arg2_base = arith.constant -10 : i64 + %arg2_power = arith.constant 0 : i64 + %res2 = math.ipowi %arg2_base, %arg2_power : i64 + %i2 = arith.constant 2 : index + memref.store %res2, %result[%i2] : memref<?xi64> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i2]]] : memref<?xi64> + +// --- Test negative powers --- + %arg3_base = arith.constant 0 : i64 + %arg3_power = arith.constant -1 : i64 + %res3 = math.ipowi %arg3_base, %arg3_power : i64 + %i3 = arith.constant 3 : index + memref.store %res3, %result[%i3] : memref<?xi64> +// No folding for ipowi(0, x) for x < 0: +// CHECK: %[[res3:.+]] = math.ipowi %[[cst0]], %[[cst_m1]] : i64 +// CHECK: memref.store %[[res3]], %[[result]][%[[i3]]] : memref<?xi64> + + %arg4_base = arith.constant 1 : i64 + %arg4_power = arith.constant -10 : i64 + %res4 = math.ipowi %arg4_base, %arg4_power : i64 + %i4 = arith.constant 4 : index + memref.store %res4, %result[%i4] : memref<?xi64> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i4]]] : memref<?xi64> + + %arg5_base = arith.constant 2 : i64 + %arg5_power = arith.constant -1 : i64 + %res5 = math.ipowi %arg5_base, %arg5_power : i64 + %i5 = arith.constant 5 : index + memref.store %res5, %result[%i5] : memref<?xi64> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i5]]] : memref<?xi64> + + %arg6_base = arith.constant -2 : i64 + %arg6_power = arith.constant -1 : i64 + %res6 = math.ipowi %arg6_base, %arg6_power : i64 + %i6 = arith.constant 6 : index + memref.store %res6, %result[%i6] : memref<?xi64> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i6]]] : memref<?xi64> + + %arg7_base = arith.constant -1 : i64 + %arg7_power = arith.constant -10 : i64 + %res7 = math.ipowi %arg7_base, %arg7_power : i64 + %i7 = arith.constant 7 : index + memref.store %res7, %result[%i7] : memref<?xi64> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i7]]] : memref<?xi64> + + %arg8_base = arith.constant -1 : i64 + %arg8_power = arith.constant -11 : i64 + %res8 = math.ipowi %arg8_base, %arg8_power : i64 + %i8 = arith.constant 8 : index + memref.store %res8, %result[%i8] : memref<?xi64> +// CHECK: memref.store %[[cst_m1]], %[[result]][%[[i8]]] : memref<?xi64> + +// --- Test positive powers --- + %arg9_base = arith.constant -3 : i64 + %arg9_power = arith.constant 3 : i64 + %res9 = math.ipowi %arg9_base, %arg9_power : i64 + %i9 = arith.constant 9 : index + memref.store %res9, %result[%i9] : memref<?xi64> +// CHECK: memref.store %[[cst_m27]], %[[result]][%[[i9]]] : memref<?xi64> + + %arg10_base = arith.constant 2 : i64 + %arg10_power = arith.constant 30 : i64 + %res10 = math.ipowi %arg10_base, %arg10_power : i64 + %i10 = arith.constant 10 : index + memref.store %res10, %result[%i10] : memref<?xi64> +// CHECK: memref.store %[[cst1073741824]], %[[result]][%[[i10]]] : memref<?xi64> + + %arg11_base = arith.constant 2 : i64 + %arg11_power = arith.constant 48 : i64 + %res11 = math.ipowi %arg11_base, %arg11_power : i64 + %i11 = arith.constant 11 : index + memref.store %res11, %result[%i11] : memref<?xi64> +// CHECK: memref.store %[[cst281474976710656]], %[[result]][%[[i11]]] : memref<?xi64> + + return +} + +// CHECK-LABEL: @ipowi16_fold( +// CHECK-SAME: %[[result:.+]]: memref<?xi16> +func.func @ipowi16_fold(%result : memref<?xi16>) { +// CHECK-DAG: %[[cst0:.+]] = arith.constant 0 : i16 +// CHECK-DAG: %[[cst1:.+]] = arith.constant 1 : i16 +// CHECK-DAG: %[[cst16384:.+]] = arith.constant 16384 : i16 +// CHECK-DAG: %[[cst_m1:.+]] = arith.constant -1 : i16 +// CHECK-DAG: %[[cst_m27:.+]] = arith.constant -27 : i16 +// CHECK-DAG: %[[i0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[i1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[i2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[i3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[i4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[i5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[i6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[i7:.+]] = arith.constant 7 : index +// CHECK-DAG: %[[i8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[i9:.+]] = arith.constant 9 : index +// CHECK-DAG: %[[i10:.+]] = arith.constant 10 : index + +// --- Test power == 0 --- + %arg0_base = arith.constant 0 : i16 + %arg0_power = arith.constant 0 : i16 + %res0 = math.ipowi %arg0_base, %arg0_power : i16 + %i0 = arith.constant 0 : index + memref.store %res0, %result[%i0] : memref<?xi16> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i0]]] : memref<?xi16> + + %arg1_base = arith.constant 10 : i16 + %arg1_power = arith.constant 0 : i16 + %res1 = math.ipowi %arg1_base, %arg1_power : i16 + %i1 = arith.constant 1 : index + memref.store %res1, %result[%i1] : memref<?xi16> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i1]]] : memref<?xi16> + + %arg2_base = arith.constant -10 : i16 + %arg2_power = arith.constant 0 : i16 + %res2 = math.ipowi %arg2_base, %arg2_power : i16 + %i2 = arith.constant 2 : index + memref.store %res2, %result[%i2] : memref<?xi16> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i2]]] : memref<?xi16> + +// --- Test negative powers --- + %arg3_base = arith.constant 0 : i16 + %arg3_power = arith.constant -1 : i16 + %res3 = math.ipowi %arg3_base, %arg3_power : i16 + %i3 = arith.constant 3 : index + memref.store %res3, %result[%i3] : memref<?xi16> +// No folding for ipowi(0, x) for x < 0: +// CHECK: %[[res3:.+]] = math.ipowi %[[cst0]], %[[cst_m1]] : i16 +// CHECK: memref.store %[[res3]], %[[result]][%[[i3]]] : memref<?xi16> + + %arg4_base = arith.constant 1 : i16 + %arg4_power = arith.constant -10 : i16 + %res4 = math.ipowi %arg4_base, %arg4_power : i16 + %i4 = arith.constant 4 : index + memref.store %res4, %result[%i4] : memref<?xi16> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i4]]] : memref<?xi16> + + %arg5_base = arith.constant 2 : i16 + %arg5_power = arith.constant -1 : i16 + %res5 = math.ipowi %arg5_base, %arg5_power : i16 + %i5 = arith.constant 5 : index + memref.store %res5, %result[%i5] : memref<?xi16> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i5]]] : memref<?xi16> + + %arg6_base = arith.constant -2 : i16 + %arg6_power = arith.constant -1 : i16 + %res6 = math.ipowi %arg6_base, %arg6_power : i16 + %i6 = arith.constant 6 : index + memref.store %res6, %result[%i6] : memref<?xi16> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i6]]] : memref<?xi16> + + %arg7_base = arith.constant -1 : i16 + %arg7_power = arith.constant -10 : i16 + %res7 = math.ipowi %arg7_base, %arg7_power : i16 + %i7 = arith.constant 7 : index + memref.store %res7, %result[%i7] : memref<?xi16> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i7]]] : memref<?xi16> + + %arg8_base = arith.constant -1 : i16 + %arg8_power = arith.constant -11 : i16 + %res8 = math.ipowi %arg8_base, %arg8_power : i16 + %i8 = arith.constant 8 : index + memref.store %res8, %result[%i8] : memref<?xi16> +// CHECK: memref.store %[[cst_m1]], %[[result]][%[[i8]]] : memref<?xi16> + +// --- Test positive powers --- + %arg9_base = arith.constant -3 : i16 + %arg9_power = arith.constant 3 : i16 + %res9 = math.ipowi %arg9_base, %arg9_power : i16 + %i9 = arith.constant 9 : index + memref.store %res9, %result[%i9] : memref<?xi16> +// CHECK: memref.store %[[cst_m27]], %[[result]][%[[i9]]] : memref<?xi16> + + %arg10_base = arith.constant 2 : i16 + %arg10_power = arith.constant 14 : i16 + %res10 = math.ipowi %arg10_base, %arg10_power : i16 + %i10 = arith.constant 10 : index + memref.store %res10, %result[%i10] : memref<?xi16> +// CHECK: memref.store %[[cst16384]], %[[result]][%[[i10]]] : memref<?xi16> + + return +} + +// CHECK-LABEL: @ipowi8_fold( +// CHECK-SAME: %[[result:.+]]: memref<?xi8> +func.func @ipowi8_fold(%result : memref<?xi8>) { +// CHECK-DAG: %[[cst0:.+]] = arith.constant 0 : i8 +// CHECK-DAG: %[[cst1:.+]] = arith.constant 1 : i8 +// CHECK-DAG: %[[cst64:.+]] = arith.constant 64 : i8 +// CHECK-DAG: %[[cst_m1:.+]] = arith.constant -1 : i8 +// CHECK-DAG: %[[cst_m27:.+]] = arith.constant -27 : i8 +// CHECK-DAG: %[[i0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[i1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[i2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[i3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[i4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[i5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[i6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[i7:.+]] = arith.constant 7 : index +// CHECK-DAG: %[[i8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[i9:.+]] = arith.constant 9 : index +// CHECK-DAG: %[[i10:.+]] = arith.constant 10 : index + +// --- Test power == 0 --- + %arg0_base = arith.constant 0 : i8 + %arg0_power = arith.constant 0 : i8 + %res0 = math.ipowi %arg0_base, %arg0_power : i8 + %i0 = arith.constant 0 : index + memref.store %res0, %result[%i0] : memref<?xi8> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i0]]] : memref<?xi8> + + %arg1_base = arith.constant 10 : i8 + %arg1_power = arith.constant 0 : i8 + %res1 = math.ipowi %arg1_base, %arg1_power : i8 + %i1 = arith.constant 1 : index + memref.store %res1, %result[%i1] : memref<?xi8> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i1]]] : memref<?xi8> + + %arg2_base = arith.constant -10 : i8 + %arg2_power = arith.constant 0 : i8 + %res2 = math.ipowi %arg2_base, %arg2_power : i8 + %i2 = arith.constant 2 : index + memref.store %res2, %result[%i2] : memref<?xi8> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i2]]] : memref<?xi8> + +// --- Test negative powers --- + %arg3_base = arith.constant 0 : i8 + %arg3_power = arith.constant -1 : i8 + %res3 = math.ipowi %arg3_base, %arg3_power : i8 + %i3 = arith.constant 3 : index + memref.store %res3, %result[%i3] : memref<?xi8> +// No folding for ipowi(0, x) for x < 0: +// CHECK: %[[res3:.+]] = math.ipowi %[[cst0]], %[[cst_m1]] : i8 +// CHECK: memref.store %[[res3]], %[[result]][%[[i3]]] : memref<?xi8> + + %arg4_base = arith.constant 1 : i8 + %arg4_power = arith.constant -10 : i8 + %res4 = math.ipowi %arg4_base, %arg4_power : i8 + %i4 = arith.constant 4 : index + memref.store %res4, %result[%i4] : memref<?xi8> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i4]]] : memref<?xi8> + + %arg5_base = arith.constant 2 : i8 + %arg5_power = arith.constant -1 : i8 + %res5 = math.ipowi %arg5_base, %arg5_power : i8 + %i5 = arith.constant 5 : index + memref.store %res5, %result[%i5] : memref<?xi8> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i5]]] : memref<?xi8> + + %arg6_base = arith.constant -2 : i8 + %arg6_power = arith.constant -1 : i8 + %res6 = math.ipowi %arg6_base, %arg6_power : i8 + %i6 = arith.constant 6 : index + memref.store %res6, %result[%i6] : memref<?xi8> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i6]]] : memref<?xi8> + + %arg7_base = arith.constant -1 : i8 + %arg7_power = arith.constant -10 : i8 + %res7 = math.ipowi %arg7_base, %arg7_power : i8 + %i7 = arith.constant 7 : index + memref.store %res7, %result[%i7] : memref<?xi8> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i7]]] : memref<?xi8> + + %arg8_base = arith.constant -1 : i8 + %arg8_power = arith.constant -11 : i8 + %res8 = math.ipowi %arg8_base, %arg8_power : i8 + %i8 = arith.constant 8 : index + memref.store %res8, %result[%i8] : memref<?xi8> +// CHECK: memref.store %[[cst_m1]], %[[result]][%[[i8]]] : memref<?xi8> + +// --- Test positive powers --- + %arg9_base = arith.constant -3 : i8 + %arg9_power = arith.constant 3 : i8 + %res9 = math.ipowi %arg9_base, %arg9_power : i8 + %i9 = arith.constant 9 : index + memref.store %res9, %result[%i9] : memref<?xi8> +// CHECK: memref.store %[[cst_m27]], %[[result]][%[[i9]]] : memref<?xi8> + + %arg10_base = arith.constant 2 : i8 + %arg10_power = arith.constant 6 : i8 + %res10 = math.ipowi %arg10_base, %arg10_power : i8 + %i10 = arith.constant 10 : index + memref.store %res10, %result[%i10] : memref<?xi8> +// CHECK: memref.store %[[cst64]], %[[result]][%[[i10]]] : memref<?xi8> + + return +} diff --git a/mlir/test/Dialect/NVGPU/mma-sync-f32-to-tf32.mlir b/mlir/test/Dialect/NVGPU/mma-sync-f32-to-tf32.mlir --- a/mlir/test/Dialect/NVGPU/mma-sync-f32-to-tf32.mlir +++ b/mlir/test/Dialect/NVGPU/mma-sync-f32-to-tf32.mlir @@ -18,3 +18,12 @@ return %d : vector<2x2xf32> } // ----- + +// Negative test for the non-f32 case. +// CHECK-LABEL: mma_sync_f16 +// CHECK-NOT: tf32Enabled +// CHECK: return +func.func @mma_sync_f16(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> { + %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> + return %d : vector<2x2xf16> +}
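Stepping back to the canonicalize_ipowi.mlir expectations above: every CHECK value follows from ordinary integer-power semantics, with negative exponents rewritten via 1/base in integer division. A hypothetical scalar reference in plain C++, not the folder's actual code (which presumably operates on APInt); INT64_MIN exponents are ignored for brevity:

#include <cstdint>
#include <optional>

// Illustrative scalar fold for math.ipowi with i64 semantics.
// For exp < 0, x**exp becomes (1/x)**(-exp) under truncating integer
// division, so only bases 0, 1 and -1 survive; ipowi(0, exp < 0) is
// left unfolded because the result is a division by zero.
static std::optional<int64_t> foldIPowI(int64_t base, int64_t exp) {
  if (exp < 0) {
    if (base == 0)
      return std::nullopt; // keep the op, as the tests above expect
    base = 1 / base;       // 0 for |base| > 1, otherwise +/-1
    exp = -exp;
  }
  int64_t result = 1;
  while (exp > 0) { // exponentiation by squaring
    if (exp & 1)
      result *= base;
    base *= base;
    exp >>= 1;
  }
  return result; // e.g. foldIPowI(-3, 3) == -27, foldIPowI(2, 30) == 1 << 30
}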
diff --git a/mlir/test/Dialect/SPIRV/IR/joint-matrix-ops.mlir b/mlir/test/Dialect/SPIRV/IR/joint-matrix-ops.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/SPIRV/IR/joint-matrix-ops.mlir @@ -0,0 +1,158 @@ +// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -verify-diagnostics %s | FileCheck %s + +// CHECK-LABEL: @joint_matrix_load +spv.func @joint_matrix_load(%ptr : !spv.ptr<i32, Workgroup>, %stride : i32) "None" { + // CHECK: {{%.*}} = spv.JointMatrixLoadINTEL {{%.*}}, {{%.*}} : (!spv.ptr<i32, Workgroup>, i32) -> !spv.jointmatrix<16x8xi32, RowMajor, Workgroup> + %0 = spv.JointMatrixLoadINTEL %ptr, %stride : (!spv.ptr<i32, Workgroup>, i32) -> !spv.jointmatrix<16x8xi32, RowMajor, Workgroup> + spv.Return +} + +// ----- +// CHECK-LABEL: @joint_matrix_load_memaccess +spv.func @joint_matrix_load_memaccess(%ptr : !spv.ptr<i32, CrossWorkgroup>, %stride : i32) "None" { + // CHECK: {{%.*}} = spv.JointMatrixLoadINTEL {{%.*}}, {{%.*}} {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, i32) -> !spv.jointmatrix<8x16xi32, ColumnMajor, Subgroup> + %0 = spv.JointMatrixLoadINTEL %ptr, %stride {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, i32) -> !spv.jointmatrix<8x16xi32, ColumnMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_load_diff_ptr_type +spv.func @joint_matrix_load_diff_ptr_type(%ptr : !spv.ptr<vector<4xi32>, Workgroup>, %stride : i32) "None" { + // CHECK: {{%.*}} = spv.JointMatrixLoadINTEL {{%.*}}, {{%.*}} {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<vector<4xi32>, Workgroup>, i32) -> !spv.jointmatrix<8x16xi32, RowMajor, Workgroup> + %0 = spv.JointMatrixLoadINTEL %ptr, %stride {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<vector<4xi32>, Workgroup>, i32) -> !spv.jointmatrix<8x16xi32, RowMajor, Workgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_store +spv.func @joint_matrix_store(%ptr : !spv.ptr<i32, Workgroup>, %stride : i32, %m : !spv.jointmatrix<8x16xi32, RowMajor, Workgroup>) "None" { + // CHECK: spv.JointMatrixStoreINTEL {{%.*}}, {{%.*}}, {{%.*}} : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Workgroup>, i32) + spv.JointMatrixStoreINTEL %ptr, %m, %stride : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Workgroup>, i32) + spv.Return +} + +// CHECK-LABEL: @joint_matrix_store_memaccess +spv.func @joint_matrix_store_memaccess(%ptr : !spv.ptr<i32, Workgroup>, %m : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %stride : i32) "None" { + // CHECK: spv.JointMatrixStoreINTEL {{%.*}}, {{%.*}}, {{%.*}} {Volatile} : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, i32) + spv.JointMatrixStoreINTEL %ptr, %m, %stride {Volatile} : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, i32) + spv.Return +} + +// CHECK-LABEL: @joint_matrix_length +spv.func @joint_matrix_length() -> i32 "None" { + // CHECK: {{%.*}} = spv.JointMatrixWorkItemLengthINTEL : !spv.jointmatrix<8x16xi32, PackedB, Subgroup> + %0 = spv.JointMatrixWorkItemLengthINTEL : !spv.jointmatrix<8x16xi32, PackedB, Subgroup> + spv.ReturnValue %0 : i32 +} + +// CHECK-LABEL: @joint_matrix_muladd
spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<8x32xi8, RowMajor, Subgroup>, %b : !spv.jointmatrix<32x8xi8, ColumnMajor, Subgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.JointMatrixMadINTEL {{%.*}}, {{%.*}}, {{%.*}} : !spv.jointmatrix<8x32xi8, RowMajor, Subgroup>, !spv.jointmatrix<32x8xi8, ColumnMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<8x32xi8, RowMajor, Subgroup>, !spv.jointmatrix<32x8xi8, ColumnMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_add +spv.func @joint_matrix_add(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.IAdd {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.IAdd %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_sub +spv.func @joint_matrix_sub(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.ISub {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.ISub %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_sdiv +spv.func @joint_matrix_sdiv(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.SDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.SDiv %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_udiv +spv.func @joint_matrix_udiv(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.UDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.UDiv %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_fadd +spv.func @joint_matrix_fadd(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FAdd {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FAdd %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_fsub +spv.func @joint_matrix_fsub(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FSub {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FSub %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_fdiv +spv.func @joint_matrix_fdiv(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FDiv %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +// CHECK-LABEL: @joint_matrix_access_chain +spv.func @joint_matrix_access_chain(%a : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>) -> !spv.ptr<f32, Function> "None" { + %0 = spv.Constant 0: i32 + // CHECK: {{%.*}} = spv.AccessChain {{%.*}}[{{%.*}}] : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>, i32 + %1 = spv.AccessChain %a[%0] : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>, i32 + spv.ReturnValue %1 : !spv.ptr<f32, Function> +} + +// ----- + +spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<16x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<16x8xi32, RowMajor, Subgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // expected-error @+1 {{'spv.JointMatrixMadINTEL' op matrix size must match}} + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<16x16xi32, RowMajor, Subgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // expected-error @+1 {{'spv.JointMatrixMadINTEL' op matrix size must match}} + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<16x8xi32, RowMajor, Workgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // expected-error @+1 {{'spv.JointMatrixMadINTEL' op matrix scope must match}} + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Workgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<16x8xi32, RowMajor, Subgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // expected-error @+1 {{matrix element type must match}} + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +spv.func @joint_matrix_load_memaccess(%ptr : !spv.ptr<!spv.struct<(i32 [0])>, Workgroup>, %stride : i32) "None" { + // expected-error @+1 {{Pointer must point to a scalar or vector type}} + %0 = spv.JointMatrixLoadINTEL %ptr, %stride : (!spv.ptr<!spv.struct<(i32 [0])>, Workgroup>, i32)-> !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +spv.func @joint_matrix_load_memaccess(%ptr : !spv.ptr<i32, Function>, %stride : i32) "None" { + // expected-error @+1 {{Pointer storage class must be Workgroup or CrossWorkgroup}} + %0 = spv.JointMatrixLoadINTEL %ptr, %stride : (!spv.ptr<i32, Function>, i32) -> !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} diff --git a/mlir/test/Target/SPIRV/joint-matrix-ops.mlir b/mlir/test/Target/SPIRV/joint-matrix-ops.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Target/SPIRV/joint-matrix-ops.mlir @@ -0,0 +1,102 @@ +// RUN: mlir-translate -test-spirv-roundtrip -split-input-file %s | FileCheck %s + +spv.module Logical GLSL450 requires #spv.vce<v1.0, [JointMatrixINTEL], [SPV_INTEL_joint_matrix]> { + // CHECK-LABEL: @joint_matrix_load + spv.func @joint_matrix_load(%ptr : !spv.ptr<i32, Workgroup>, %stride : i32) "None" { + // CHECK: {{%.*}} = spv.JointMatrixLoadINTEL {{%.*}}, {{%.*}} : (!spv.ptr<i32, Workgroup>, i32) -> !spv.jointmatrix<16x8xi32, RowMajor, Workgroup> + %0 = spv.JointMatrixLoadINTEL %ptr, %stride : (!spv.ptr<i32, Workgroup>, i32) -> !spv.jointmatrix<16x8xi32, RowMajor, Workgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_load_memaccess + spv.func @joint_matrix_load_memaccess(%ptr : !spv.ptr<i32, CrossWorkgroup>, %stride : i32) "None" { + // CHECK: {{%.*}} = spv.JointMatrixLoadINTEL {{%.*}}, {{%.*}} {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, i32) -> !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %0 = spv.JointMatrixLoadINTEL %ptr, %stride {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, i32) -> !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_store + spv.func @joint_matrix_store(%ptr : !spv.ptr<i32, Workgroup>, %stride : i32, %m : !spv.jointmatrix<16x8xi32, RowMajor, Workgroup>) "None" { + // CHECK: spv.JointMatrixStoreINTEL {{%.*}}, {{%.*}}, {{%.*}} : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Workgroup>, i32) + spv.JointMatrixStoreINTEL %ptr, %m, %stride : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Workgroup>, i32) + spv.Return + } + + // CHECK-LABEL: @joint_matrix_store_memaccess + spv.func @joint_matrix_store_memaccess(%ptr : !spv.ptr<i32, CrossWorkgroup>, %m : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %stride : i32) "None" { + // CHECK: spv.JointMatrixStoreINTEL {{%.*}}, {{%.*}}, {{%.*}} {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, i32) + spv.JointMatrixStoreINTEL %ptr, %m, %stride {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, i32) + spv.Return + } + + // CHECK-LABEL: @joint_matrix_length + spv.func @joint_matrix_length() -> i32 "None" { + // CHECK: {{%.*}} = spv.JointMatrixWorkItemLengthINTEL : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %0 = spv.JointMatrixWorkItemLengthINTEL : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.ReturnValue %0 : i32 + } + + // CHECK-LABEL: @joint_matrix_muladd + spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<16x8xi32, RowMajor, Subgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.JointMatrixMadINTEL {{%.*}}, {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_add + spv.func @joint_matrix_add(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.IAdd {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.IAdd %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_sub + spv.func @joint_matrix_sub(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.ISub {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.ISub %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_sdiv + spv.func @joint_matrix_sdiv(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.SDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.SDiv %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_udiv + spv.func @joint_matrix_udiv(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.UDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.UDiv %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_fadd + spv.func @joint_matrix_fadd(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FAdd {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FAdd %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_fsub + spv.func @joint_matrix_fsub(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FSub {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FSub %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_fdiv + spv.func @joint_matrix_fdiv(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FDiv %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_access_chain + spv.func @joint_matrix_access_chain(%a : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>) -> !spv.ptr<f32, Function> "None" { + %0 = spv.Constant 0: i32 + // CHECK: {{%.*}} = spv.AccessChain {{%.*}}[{{%.*}}] : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>, i32 + %1 = spv.AccessChain %a[%0] : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>, i32 + spv.ReturnValue %1 : !spv.ptr<f32, Function> + } +} diff --git a/mlir/test/Target/SPIRV/memory-ops.mlir b/mlir/test/Target/SPIRV/memory-ops.mlir --- a/mlir/test/Target/SPIRV/memory-ops.mlir +++ b/mlir/test/Target/SPIRV/memory-ops.mlir @@ -1,15 +1,25 @@ // RUN: mlir-translate -test-spirv-roundtrip -split-input-file %s | FileCheck %s -// CHECK: spv.func {{@.*}}([[ARG1:%.*]]: !spv.ptr<f32, Input>, [[ARG2:%.*]]: !spv.ptr<f32, Output>) "None" { -// CHECK-NEXT: [[VALUE:%.*]] = spv.Load "Input" [[ARG1]] : f32 -// CHECK-NEXT: spv.Store "Output" [[ARG2]], [[VALUE]] : f32 spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], []> { + // CHECK-LABEL: spv.func @load_store + // CHECK-SAME: ([[ARG1:%.*]]: !spv.ptr<f32, Input>, [[ARG2:%.*]]: !spv.ptr<f32, Output>) spv.func @load_store(%arg0 : !spv.ptr<f32, Input>, %arg1 : !spv.ptr<f32, Output>) "None" { + // CHECK-NEXT: [[VALUE:%.*]] = spv.Load "Input" [[ARG1]] : f32 %1 = spv.Load "Input" %arg0 : f32 + // CHECK-NEXT: spv.Store "Output" [[ARG2]], [[VALUE]] : f32 spv.Store "Output" %arg1, %1 : f32 spv.Return } + + // CHECK-LABEL: spv.func @load_store_memory_operands + spv.func @load_store_memory_operands(%arg0 : !spv.ptr<f32, Input>, %arg1 : !spv.ptr<f32, Output>) "None" { + // CHECK: spv.Load "Input" %{{.+}} ["Volatile|Aligned", 4] : f32 + %1 = spv.Load "Input" %arg0 ["Volatile|Aligned", 4]: f32 + // CHECK: spv.Store "Output" %{{.+}}, %{{.+}} ["Volatile|Aligned", 4] : f32 + spv.Store "Output" %arg1, %1 ["Volatile|Aligned", 4]: f32 + spv.Return + } } // ----- diff --git a/mlir/test/Transforms/test-toposort.mlir b/mlir/test/Transforms/test-toposort.mlir --- a/mlir/test/Transforms/test-toposort.mlir +++ b/mlir/test/Transforms/test-toposort.mlir @@ -1,27 +1,39 @@ // RUN: mlir-opt -topological-sort %s | FileCheck %s +// RUN: mlir-opt -test-topological-sort-analysis %s | FileCheck %s -check-prefix=CHECK-ANALYSIS // Test producer is after user.
// CHECK-LABEL: test.graph_region -test.graph_region { +// CHECK-ANALYSIS-LABEL: test.graph_region +test.graph_region attributes{"root"} { // CHECK-NEXT: test.foo // CHECK-NEXT: test.baz // CHECK-NEXT: test.bar - %0 = "test.foo"() : () -> i32 - "test.bar"(%1, %0) : (i32, i32) -> () - %1 = "test.baz"() : () -> i32 + + // CHECK-ANALYSIS-NEXT: test.foo{{.*}} {pos = 0 + // CHECK-ANALYSIS-NEXT: test.bar{{.*}} {pos = 2 + // CHECK-ANALYSIS-NEXT: test.baz{{.*}} {pos = 1 + %0 = "test.foo"() {selected} : () -> i32 + "test.bar"(%1, %0) {selected} : (i32, i32) -> () + %1 = "test.baz"() {selected} : () -> i32 } // Test cycles. // CHECK-LABEL: test.graph_region -test.graph_region { +// CHECK-ANALYSIS-LABEL: test.graph_region +test.graph_region attributes{"root"} { // CHECK-NEXT: test.d // CHECK-NEXT: test.a // CHECK-NEXT: test.c // CHECK-NEXT: test.b - %2 = "test.c"(%1) : (i32) -> i32 + + // CHECK-ANALYSIS-NEXT: test.c{{.*}} {pos = 0 + // CHECK-ANALYSIS-NEXT: test.b{{.*}} : ( + // CHECK-ANALYSIS-NEXT: test.a{{.*}} {pos = 2 + // CHECK-ANALYSIS-NEXT: test.d{{.*}} {pos = 1 + %2 = "test.c"(%1) {selected} : (i32) -> i32 %1 = "test.b"(%0, %2) : (i32, i32) -> i32 - %0 = "test.a"(%3) : (i32) -> i32 - %3 = "test.d"() : () -> i32 + %0 = "test.a"(%3) {selected} : (i32) -> i32 + %3 = "test.d"() {selected} : () -> i32 } // Test block arguments. diff --git a/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.cpp --- a/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.cpp +++ b/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.cpp @@ -61,9 +61,6 @@ /// The lattice is always initialized. bool isUninitialized() const override { return false; } - /// Initialize the lattice. Does nothing. - ChangeResult defaultInitialize() override { return ChangeResult::NoChange; } - /// Mark the lattice as having reached its pessimistic fixpoint. That is, the /// last modifications of all memory resources are unknown. ChangeResult reset() override { @@ -73,9 +70,6 @@ return ChangeResult::Change; } - /// The lattice is never at a fixpoint. - bool isAtFixpoint() const override { return false; } - /// Join the last modifications. ChangeResult join(const AbstractDenseLattice &lattice) override { const auto &rhs = static_cast(lattice); diff --git a/mlir/test/lib/Analysis/TestDataFlowFramework.cpp b/mlir/test/lib/Analysis/TestDataFlowFramework.cpp --- a/mlir/test/lib/Analysis/TestDataFlowFramework.cpp +++ b/mlir/test/lib/Analysis/TestDataFlowFramework.cpp @@ -20,9 +20,6 @@ using AnalysisState::AnalysisState; - /// Default-initialize the state to zero. - ChangeResult defaultInitialize() override { return join(0); } - /// Returns true if the state is uninitialized. 
bool isUninitialized() const override { return !state; } diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp --- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp @@ -249,14 +249,16 @@ template <typename IdOp, typename NProcsOp> static SmallVector<ProcInfo, 2> -getGpuProcIds(OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges) { +getGpuProcIds(OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges, + ArrayRef<DistributionMethod> distributionMethod) { size_t count = std::min<size_t>(3, parallelLoopRanges.size()); SmallVector<ProcInfo, 2> procInfo(count); Type indexType = b.getIndexType(); for (unsigned i = 0; i < count; ++i) { gpu::Dimension dim = *gpu::symbolizeDimension(i); procInfo[count - 1 - i] = {b.create<IdOp>(loc, indexType, dim), - b.create<NProcsOp>(loc, indexType, dim)}; + b.create<NProcsOp>(loc, indexType, dim), + distributionMethod[count - 1 - i]}; } return procInfo; } @@ -265,10 +267,15 @@ RewritePatternSet &patterns) { { LinalgLoopDistributionOptions cyclicNprocsEqNiters; - cyclicNprocsEqNiters.distributionMethod.resize( - 2, DistributionMethod::CyclicNumProcsEqNumIters); + SmallVector<DistributionMethod> distributionMethod = { + DistributionMethod::CyclicNumProcsEqNumIters, + DistributionMethod::CyclicNumProcsEqNumIters}; cyclicNprocsEqNiters.procInfo = - getGpuProcIds; + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -282,10 +289,15 @@ { LinalgLoopDistributionOptions cyclicNprocsGeNiters; - cyclicNprocsGeNiters.distributionMethod.resize( - 2, DistributionMethod::CyclicNumProcsGeNumIters); + SmallVector<DistributionMethod> distributionMethod = { + DistributionMethod::CyclicNumProcsGeNumIters, + DistributionMethod::CyclicNumProcsGeNumIters}; cyclicNprocsGeNiters.procInfo = - getGpuProcIds; + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -299,10 +311,14 @@ { LinalgLoopDistributionOptions cyclicNprocsDefault; - cyclicNprocsDefault.distributionMethod.resize(2, - DistributionMethod::Cyclic); + SmallVector<DistributionMethod> distributionMethod = { + DistributionMethod::Cyclic, DistributionMethod::Cyclic}; cyclicNprocsDefault.procInfo = - getGpuProcIds; + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -316,10 +332,15 @@ { LinalgLoopDistributionOptions cyclicNprocsMixed1; - cyclicNprocsMixed1.distributionMethod = { + SmallVector<DistributionMethod> distributionMethod = { DistributionMethod::CyclicNumProcsEqNumIters, DistributionMethod::CyclicNumProcsGeNumIters}; - cyclicNprocsMixed1.procInfo = getGpuProcIds; + cyclicNprocsMixed1.procInfo = + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -333,10 +354,15 @@ { LinalgLoopDistributionOptions cyclicNprocsMixed2; - cyclicNprocsMixed2.distributionMethod = { + SmallVector<DistributionMethod> distributionMethod = { DistributionMethod::CyclicNumProcsGeNumIters, DistributionMethod::Cyclic}; - cyclicNprocsMixed2.procInfo = getGpuProcIds; + cyclicNprocsMixed2.procInfo = + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -350,10 +376,15 @@ { LinalgLoopDistributionOptions cyclicNprocsMixed3; - cyclicNprocsMixed3.distributionMethod = { + SmallVector<DistributionMethod> distributionMethod = { DistributionMethod::Cyclic, DistributionMethod::CyclicNumProcsEqNumIters}; - cyclicNprocsMixed3.procInfo = getGpuProcIds; + cyclicNprocsMixed3.procInfo = + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, @@ -368,10 +399,14 @@ { LinalgLoopDistributionOptions cyclicNprocsEqNiters; - cyclicNprocsEqNiters.distributionMethod.resize(2, - DistributionMethod::Cyclic); + SmallVector<DistributionMethod> distributionMethod = { + DistributionMethod::Cyclic, DistributionMethod::Cyclic}; cyclicNprocsEqNiters.procInfo = - getGpuProcIds; + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -387,8 +422,14 @@ static void fillTileFuseAndDistributePatterns(MLIRContext *context, RewritePatternSet &patterns) { LinalgLoopDistributionOptions cyclicNprocsEqNiters; - cyclicNprocsEqNiters.distributionMethod.resize(2, DistributionMethod::Cyclic); - cyclicNprocsEqNiters.procInfo = getGpuProcIds; + SmallVector<DistributionMethod> distributionMethod = { + DistributionMethod::Cyclic, DistributionMethod::Cyclic}; + cyclicNprocsEqNiters.procInfo = + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingAndFusionOptions() diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -5,6 +5,7 @@ TestControlFlowSink.cpp TestInlining.cpp TestIntRangeInference.cpp + TestTopologicalSort.cpp EXCLUDE_FROM_LIBMLIR diff --git a/mlir/test/lib/Transforms/TestTopologicalSort.cpp b/mlir/test/lib/Transforms/TestTopologicalSort.cpp new file mode 100644 --- /dev/null +++ b/mlir/test/lib/Transforms/TestTopologicalSort.cpp @@ -0,0 +1,62 @@ +//===- TestTopologicalSort.cpp - Pass to test topological sort analysis ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/TopologicalSortUtils.h" + +using namespace mlir; + +namespace { +struct TestTopologicalSortAnalysisPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestTopologicalSortAnalysisPass) + + StringRef getArgument() const final { + return "test-topological-sort-analysis"; + } + StringRef getDescription() const final { + return "Test topological sorting of ops"; + } + + void runOnOperation() override { + Operation *op = getOperation(); + OpBuilder builder(op->getContext()); + + op->walk([&](Operation *root) { + if (!root->hasAttr("root")) + return WalkResult::advance(); + + assert(root->getNumRegions() == 1 && root->getRegion(0).hasOneBlock() && + "expected one block"); + Block *block = &root->getRegion(0).front(); + SmallVector selectedOps; + block->walk([&](Operation *op) { + if (op->hasAttr("selected")) + selectedOps.push_back(op); + }); + + computeTopologicalSorting(block, selectedOps); + for (const auto &it : llvm::enumerate(selectedOps)) + it.value()->setAttr("pos", builder.getIndexAttr(it.index())); + + return WalkResult::advance(); + }); + } +}; +} // namespace + +namespace mlir { +namespace test { +void registerTestTopologicalSortAnalysisPass() { + PassRegistration(); +} +} // namespace test +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -111,6 +111,7 @@ void registerTestSliceAnalysisPass(); void registerTestTensorTransforms(); void registerTestTilingInterface(); +void registerTestTopologicalSortAnalysisPass(); void registerTestTransformDialectInterpreterPass(); void registerTestVectorLowerings(); void registerTestNvgpuLowerings(); @@ -207,6 +208,7 @@ mlir::test::registerTestSliceAnalysisPass(); mlir::test::registerTestTensorTransforms(); mlir::test::registerTestTilingInterface(); + mlir::test::registerTestTopologicalSortAnalysisPass(); mlir::test::registerTestTransformDialectInterpreterPass(); mlir::test::registerTestVectorLowerings(); mlir::test::registerTestNvgpuLowerings(); diff --git a/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp b/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp --- a/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp +++ b/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp @@ -518,7 +518,8 @@ os << tabs << formatv("if (auto attr = {0}->getAttr(\"{1}\")) {{\n", opVar, attrName); if (attr.getAttrDefName() == "SPV_ScopeAttr" || - attr.getAttrDefName() == "SPV_MemorySemanticsAttr") { + attr.getAttrDefName() == "SPV_MemorySemanticsAttr" || + attr.getAttrDefName() == "SPV_MatrixLayoutAttr") { // These two enums are encoded as to constant values in SPIR-V blob, // but we directly use the constant value as attribute in SPIR-V dialect. So // need to handle them separately from normal enum attributes. @@ -810,7 +811,8 @@ StringRef words, StringRef wordIndex, raw_ostream &os) { if (attr.getAttrDefName() == "SPV_ScopeAttr" || - attr.getAttrDefName() == "SPV_MemorySemanticsAttr") { + attr.getAttrDefName() == "SPV_MemorySemanticsAttr" || + attr.getAttrDefName() == "SPV_MatrixLayoutAttr") { // These two enums are encoded as to constant values in SPIR-V blob, // but we directly use the constant value as attribute in SPIR-V dialect. 
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -111,6 +111,7 @@
 void registerTestSliceAnalysisPass();
 void registerTestTensorTransforms();
 void registerTestTilingInterface();
+void registerTestTopologicalSortAnalysisPass();
 void registerTestTransformDialectInterpreterPass();
 void registerTestVectorLowerings();
 void registerTestNvgpuLowerings();
@@ -207,6 +208,7 @@
   mlir::test::registerTestSliceAnalysisPass();
   mlir::test::registerTestTensorTransforms();
   mlir::test::registerTestTilingInterface();
+  mlir::test::registerTestTopologicalSortAnalysisPass();
   mlir::test::registerTestTransformDialectInterpreterPass();
   mlir::test::registerTestVectorLowerings();
   mlir::test::registerTestNvgpuLowerings();
diff --git a/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp b/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp
--- a/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp
+++ b/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp
@@ -518,7 +518,8 @@
   os << tabs << formatv("if (auto attr = {0}->getAttr(\"{1}\")) {{\n", opVar,
                         attrName);
   if (attr.getAttrDefName() == "SPV_ScopeAttr" ||
-      attr.getAttrDefName() == "SPV_MemorySemanticsAttr") {
+      attr.getAttrDefName() == "SPV_MemorySemanticsAttr" ||
+      attr.getAttrDefName() == "SPV_MatrixLayoutAttr") {
     // These enums are encoded as constant values in the SPIR-V blob, but we
     // use the constant value directly as the attribute in the SPIR-V dialect,
     // so they need to be handled separately from normal enum attributes.
@@ -810,7 +811,8 @@
                               StringRef words, StringRef wordIndex,
                               raw_ostream &os) {
   if (attr.getAttrDefName() == "SPV_ScopeAttr" ||
-      attr.getAttrDefName() == "SPV_MemorySemanticsAttr") {
+      attr.getAttrDefName() == "SPV_MemorySemanticsAttr" ||
+      attr.getAttrDefName() == "SPV_MatrixLayoutAttr") {
     // These enums are encoded as constant values in the SPIR-V blob, but we
     // use the constant value directly as the attribute in the SPIR-V dialect,
     // so they need to be handled separately from normal enum attributes.
diff --git a/mlir/unittests/ExecutionEngine/CMakeLists.txt b/mlir/unittests/ExecutionEngine/CMakeLists.txt
--- a/mlir/unittests/ExecutionEngine/CMakeLists.txt
+++ b/mlir/unittests/ExecutionEngine/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_mlir_unittest(MLIRExecutionEngineTests
+  DynamicMemRef.cpp
   Invoke.cpp
 )
 get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
diff --git a/mlir/unittests/ExecutionEngine/DynamicMemRef.cpp b/mlir/unittests/ExecutionEngine/DynamicMemRef.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/unittests/ExecutionEngine/DynamicMemRef.cpp
@@ -0,0 +1,99 @@
+//===- DynamicMemRef.cpp ---------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/ExecutionEngine/CRunnerUtils.h"
+#include "llvm/ADT/SmallVector.h"
+
+#include "gmock/gmock.h"
+
+using namespace ::mlir;
+using namespace ::testing;
+
+TEST(DynamicMemRef, rankZero) {
+  int data = 57;
+
+  StridedMemRefType<int, 0> memRef;
+  memRef.basePtr = &data;
+  memRef.data = &data;
+  memRef.offset = 0;
+
+  DynamicMemRefType<int> dynamicMemRef(memRef);
+
+  llvm::SmallVector<int> values(dynamicMemRef.begin(), dynamicMemRef.end());
+  EXPECT_THAT(values, ElementsAre(57));
+}
+
+TEST(DynamicMemRef, rankOne) {
+  std::array<int, 3> data;
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    data[i] = i;
+  }
+
+  StridedMemRefType<int, 1> memRef;
+  memRef.basePtr = data.data();
+  memRef.data = data.data();
+  memRef.offset = 0;
+  memRef.sizes[0] = 3;
+  memRef.strides[0] = 1;
+
+  DynamicMemRefType<int> dynamicMemRef(memRef);
+
+  llvm::SmallVector<int> values(dynamicMemRef.begin(), dynamicMemRef.end());
+  EXPECT_THAT(values, ElementsAreArray(data));
+
+  for (int64_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(*dynamicMemRef[i], data[i]);
+  }
+}
+
+TEST(DynamicMemRef, rankTwo) {
+  std::array<int, 6> data;
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    data[i] = i;
+  }
+
+  StridedMemRefType<int, 2> memRef;
+  memRef.basePtr = data.data();
+  memRef.data = data.data();
+  memRef.offset = 0;
+  memRef.sizes[0] = 2;
+  memRef.sizes[1] = 3;
+  memRef.strides[0] = 3;
+  memRef.strides[1] = 1;
+
+  DynamicMemRefType<int> dynamicMemRef(memRef);
+
+  llvm::SmallVector<int> values(dynamicMemRef.begin(), dynamicMemRef.end());
+  EXPECT_THAT(values, ElementsAreArray(data));
+}
+
+TEST(DynamicMemRef, rankThree) {
+  std::array<int, 24> data;
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    data[i] = i;
+  }
+
+  StridedMemRefType<int, 3> memRef;
+  memRef.basePtr = data.data();
+  memRef.data = data.data();
+  memRef.offset = 0;
+  memRef.sizes[0] = 2;
+  memRef.sizes[1] = 3;
+  memRef.sizes[2] = 4;
+  memRef.strides[0] = 12;
+  memRef.strides[1] = 4;
+  memRef.strides[2] = 1;
+
+  DynamicMemRefType<int> dynamicMemRef(memRef);
+
+  llvm::SmallVector<int> values(dynamicMemRef.begin(), dynamicMemRef.end());
+  EXPECT_THAT(values, ElementsAreArray(data));
+}
\ No newline at end of file
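Note: the tests above all follow one pattern: build a statically ranked `StridedMemRefType<T, N>`, wrap it in `DynamicMemRefType<T>`, and traverse it without knowing the rank. A minimal sketch of that rank-erased traversal, using only the constructor and `begin()`/`end()` exercised by the tests (the `sumElements` helper is illustrative, not from the patch):

#include "mlir/ExecutionEngine/CRunnerUtils.h"

// Sum the elements of a memref of any static rank by erasing the rank first.
template <typename T, int N>
T sumElements(const StridedMemRefType<T, N> &memRef) {
  DynamicMemRefType<T> dynamic(memRef); // sizes/strides now held at runtime
  T total = T();
  for (T value : dynamic) // iterates elements in row-major order
    total += value;
  return total;
}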
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -580,7 +580,7 @@
 int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr,
                             ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
                             AsyncInfoTy &AsyncInfo) {
-  if (!RTL->run_region || !RTL->synchronize)
+  if (!RTL->run_region_async || !RTL->synchronize)
     return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
                            TgtVarsSize);
   return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc
--- a/third-party/benchmark/src/sysinfo.cc
+++ b/third-party/benchmark/src/sysinfo.cc
@@ -12,6 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#if defined(_MSC_VER)
+// FIXME: This must be defined before any other includes to disable deprecation
+// warnings for use of codecvt from C++17. We should remove our reliance on
+// the deprecated functionality instead.
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
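Note: the macro added above only takes effect if it is visible before MSVC's STL headers are processed, which is why the patch hoists it ahead of every other include. A minimal repro of the constraint (standalone illustration; the macro and header are MSVC-specific, and nothing below is from the patch itself):

#if defined(_MSC_VER)
// Must precede any include that pulls in <codecvt>, directly or transitively;
// defining it afterwards leaves the deprecation diagnostics enabled.
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include <codecvt> // compiles warning-free on MSVC with the macro in place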