diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp --- a/clang-tools-extra/clangd/refactor/Rename.cpp +++ b/clang-tools-extra/clangd/refactor/Rename.cpp @@ -515,7 +515,8 @@ else { // Name conflict detection. // Function conflicts are subtle (overloading), so ignore them. - if (RenameDecl.getKind() != Decl::Function) { + if (RenameDecl.getKind() != Decl::Function && + RenameDecl.getKind() != Decl::CXXMethod) { if (auto *Conflict = lookupSiblingWithName(ASTCtx, RenameDecl, NewName)) Result = InvalidName{ InvalidName::Conflict, diff --git a/clang-tools-extra/clangd/unittests/RenameTests.cpp b/clang-tools-extra/clangd/unittests/RenameTests.cpp --- a/clang-tools-extra/clangd/unittests/RenameTests.cpp +++ b/clang-tools-extra/clangd/unittests/RenameTests.cpp @@ -1062,6 +1062,19 @@ )cpp", "conflict", !HeaderFile, "Conflict"}, + {R"cpp( + void func(int); + void [[o^therFunc]](double); + )cpp", + nullptr, !HeaderFile, "func"}, + {R"cpp( + struct S { + void func(int); + void [[o^therFunc]](double); + }; + )cpp", + nullptr, !HeaderFile, "func"}, + {R"cpp( int V^ar; )cpp", @@ -1121,9 +1134,7 @@ } else { EXPECT_TRUE(bool(Results)) << "rename returned an error: " << llvm::toString(Results.takeError()); - ASSERT_EQ(1u, Results->GlobalChanges.size()); - EXPECT_EQ(applyEdits(std::move(Results->GlobalChanges)).front().second, - expectedResult(T, NewName)); + EXPECT_EQ(Results->LocalChanges, T.ranges()); } } } diff --git a/clang/docs/ControlFlowIntegrity.rst b/clang/docs/ControlFlowIntegrity.rst --- a/clang/docs/ControlFlowIntegrity.rst +++ b/clang/docs/ControlFlowIntegrity.rst @@ -314,10 +314,8 @@ is a security hardening mechanism designed to be deployed in release builds. ``-fsanitize=function`` has a higher space and time overhead due to a more -complex type check at indirect call sites, as well as a need for run-time -type information (RTTI), which may make it unsuitable for deployment. 
Because -of the need for RTTI, ``-fsanitize=function`` can only be used with C++ -programs, whereas ``-fsanitize=cfi-icall`` can protect both C and C++ programs. +complex type check at indirect call sites, which may make it unsuitable for +deployment. On the other hand, ``-fsanitize=function`` conforms more closely with the C++ standard and user expectations around interaction with shared libraries; diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -1373,37 +1373,38 @@ Language Extensions Back-ported to Previous Standards ===================================================== -=================================== ================================ ============= ============= ================================== -Feature Feature Test Macro Introduced In Backported To Required Flags -=================================== ================================ ============= ============= ================================== -variadic templates __cpp_variadic_templates C++11 C++03 -Alias templates __cpp_alias_templates C++11 C++03 -Non-static data member initializers __cpp_nsdmi C++11 C++03 -Range-based ``for`` loop __cpp_range_based_for C++11 C++03 -RValue references __cpp_rvalue_references C++11 C++03 -Attributes __cpp_attributes C++11 C++03 -fdouble-square-bracket-attributes -variable templates __cpp_variable_templates C++14 C++03 -Binary literals __cpp_binary_literals C++14 C++03 -Relaxed constexpr __cpp_constexpr C++14 C++11 -``if constexpr`` __cpp_if_constexpr C++17 C++11 -fold expressions __cpp_fold_expressions C++17 C++03 -Lambda capture of \*this by value __cpp_capture_star_this C++17 C++11 -Attributes on enums __cpp_enumerator_attributes C++17 C++11 -Guaranteed copy elision __cpp_guaranteed_copy_elision C++17 C++03 -Hexadecimal floating literals __cpp_hex_float C++17 C++03 -``inline`` variables __cpp_inline_variables C++17 C++03 -Attributes on namespaces 
__cpp_namespace_attributes C++17 C++11 -Structured bindings __cpp_structured_bindings C++17 C++03 -template template arguments __cpp_template_template_args C++17 C++03 -``static operator[]`` __cpp_multidimensional_subscript C++20 C++03 -Designated initializers __cpp_designated_initializers C++20 C++03 -Conditional ``explicit`` __cpp_conditional_explicit C++20 C++03 -``using enum`` __cpp_using_enum C++20 C++03 -``if consteval`` __cpp_if_consteval C++23 C++20 -``static operator()`` __cpp_static_call_operator C++23 C++03 ------------------------------------ -------------------------------- ------------- ------------- ---------------------------------- -Designated initializers C99 C89 -=================================== ================================ ============= ============= ================================== +====================================== ================================ ============= ============= ================================== +Feature Feature Test Macro Introduced In Backported To Required Flags +====================================== ================================ ============= ============= ================================== +variadic templates __cpp_variadic_templates C++11 C++03 +Alias templates __cpp_alias_templates C++11 C++03 +Non-static data member initializers __cpp_nsdmi C++11 C++03 +Range-based ``for`` loop __cpp_range_based_for C++11 C++03 +RValue references __cpp_rvalue_references C++11 C++03 +Attributes __cpp_attributes C++11 C++03 -fdouble-square-bracket-attributes +variable templates __cpp_variable_templates C++14 C++03 +Binary literals __cpp_binary_literals C++14 C++03 +Relaxed constexpr __cpp_constexpr C++14 C++11 +``if constexpr`` __cpp_if_constexpr C++17 C++11 +fold expressions __cpp_fold_expressions C++17 C++03 +Lambda capture of \*this by value __cpp_capture_star_this C++17 C++11 +Attributes on enums __cpp_enumerator_attributes C++17 C++11 +Guaranteed copy elision __cpp_guaranteed_copy_elision C++17 C++03 +Hexadecimal 
floating literals __cpp_hex_float C++17 C++03 +``inline`` variables __cpp_inline_variables C++17 C++03 +Attributes on namespaces __cpp_namespace_attributes C++17 C++11 +Structured bindings __cpp_structured_bindings C++17 C++03 +template template arguments __cpp_template_template_args C++17 C++03 +``static operator[]`` __cpp_multidimensional_subscript C++20 C++03 +Designated initializers __cpp_designated_initializers C++20 C++03 +Conditional ``explicit`` __cpp_conditional_explicit C++20 C++03 +``using enum`` __cpp_using_enum C++20 C++03 +``if consteval`` __cpp_if_consteval C++23 C++20 +``static operator()`` __cpp_static_call_operator C++23 C++03 +-------------------------------------- -------------------------------- ------------- ------------- ---------------------------------- +Designated initializers (N494) C99 C89 +Array & element qualification (N2607) C2x C89 +====================================== ================================ ============= ============= ================================== Type Trait Primitives ===================== diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -418,6 +418,10 @@ - Propagate the value-dependent bit for VAArgExpr. Fixes a crash where a __builtin_va_arg call has invalid arguments. (`#62711 `_). +- Fix crash on attempt to initialize union with flexible array member. + (`#61746 `_). +- Clang `TextNodeDumper` enabled through `-ast-dump` flag no longer evaluates the + initializer of constexpr `VarDecl` if the declaration has a dependent type. 
Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/docs/UndefinedBehaviorSanitizer.rst b/clang/docs/UndefinedBehaviorSanitizer.rst --- a/clang/docs/UndefinedBehaviorSanitizer.rst +++ b/clang/docs/UndefinedBehaviorSanitizer.rst @@ -100,7 +100,7 @@ by Clang (and by ISO/IEC/IEEE 60559 / IEEE 754) as producing either an infinity or NaN value, so is not included in ``-fsanitize=undefined``. - ``-fsanitize=function``: Indirect call of a function through a - function pointer of the wrong type (C++ only). + function pointer of the wrong type. - ``-fsanitize=implicit-unsigned-integer-truncation``, ``-fsanitize=implicit-signed-integer-truncation``: Implicit conversion from integer of larger bit width to smaller bit width, if that results in data diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1472,9 +1472,12 @@ /// Return the unique reference to a scalable vector type of the specified /// element type and scalable number of elements. + /// For RISC-V, the number of fields is also provided when fetching a + /// tuple type. /// /// \pre \p EltTy must be a built-in type. - QualType getScalableVectorType(QualType EltTy, unsigned NumElts) const; + QualType getScalableVectorType(QualType EltTy, unsigned NumElts, + unsigned NumFields = 1) const; /// Return a WebAssembly externref type. 
QualType getWebAssemblyExternrefType() const; diff --git a/clang/include/clang/Basic/RISCVVTypes.def b/clang/include/clang/Basic/RISCVVTypes.def --- a/clang/include/clang/Basic/RISCVVTypes.def +++ b/clang/include/clang/Basic/RISCVVTypes.def @@ -144,6 +144,10 @@ RVV_PREDICATE_TYPE("__rvv_bool32_t", RvvBool32, RvvBool32Ty, 2) RVV_PREDICATE_TYPE("__rvv_bool64_t", RvvBool64, RvvBool64Ty, 1) +//===- Tuple vector types -------------------------------------------------===// + +RVV_VECTOR_TYPE_INT("__rvv_int32m1x2_t", RvvInt32m1x2, RvvInt32m1x2Ty, 2, 32, 2, true) + #undef RVV_VECTOR_TYPE_FLOAT #undef RVV_VECTOR_TYPE_INT #undef RVV_VECTOR_TYPE diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -1503,6 +1503,368 @@ defm : RVVIndexedSegLoad<"vluxseg">; defm : RVVIndexedSegLoad<"vloxseg">; } + +multiclass RVVUnitStridedSegLoadTuple { + foreach type = ["i"] in { + defvar eew = !cond(!eq(type, "i") : "32"); + foreach nf = [2] in { + let Name = op # nf # "e" # eew # "_v_tuple", + OverloadedName = op # nf # "e" # eew # "_tuple", + IRName = op # nf, + MaskedIRName = op # nf # "_mask", + NF = nf, + ManualCodegen = [{ + { + assert(((IsMasked && (PolicyAttrs & RVV_VTA) && (PolicyAttrs & RVV_VMA)) || + (!IsMasked && (PolicyAttrs & RVV_VTA))) && + "FIXME: Only handling default policy (TAMA) for now"); + + llvm::Type *ElementVectorType = cast(ResultType)->elements()[0]; + IntrinsicTypes = {ElementVectorType, Ops.back()->getType()}; + SmallVector Operands; + + Operands.append(NF, llvm::PoisonValue::get(ElementVectorType)); + + unsigned Offset = IsMasked ? 
1 : 0; + Operands.push_back(Ops[Offset]); // Ptr + if (IsMasked) + Operands.push_back(Ops[0]); + Operands.push_back(Ops[Offset + 1]); // VL + if (IsMasked) + Operands.push_back(ConstantInt::get(Ops.back()->getType(), PolicyAttrs)); + + llvm::Function *F = CGM.getIntrinsic(ID, IntrinsicTypes); + + llvm::Value *LoadValue = Builder.CreateCall(F, Operands, ""); + if (ReturnValue.isNull()) + return LoadValue; + else + return Builder.CreateStore(LoadValue, ReturnValue.getValue()); + } + }] in { + defvar T = "(Tuple:" # nf # ")"; + def : RVVBuiltin<"v", T # "vPCe", type>; + } + } + } +} + +multiclass RVVUnitStridedSegStoreTuple { + foreach type = ["i"] in { + defvar eew = !cond(!eq(type, "i") : "32"); + foreach nf = [2] in { + let Name = op # nf # "e" # eew # "_v_tuple", + OverloadedName = op # nf # "e" # eew # "_tuple", + IRName = op # nf, + MaskedIRName = op # nf # "_mask", + NF = nf, + HasMaskedOffOperand = false, + ManualCodegen = [{ + { + // Masked + // Builtin: (mask, ptr, v_tuple, vl) + // Intrinsic: (val0, val1, ..., ptr, mask, vl) + // Unmasked + // Builtin: (ptr, v_tuple, vl) + // Intrinsic: (val0, val1, ..., ptr, vl) + unsigned Offset = IsMasked ? 
1 : 0; + llvm::Value *VTupleOperand = Ops[Offset + 1]; + + SmallVector Operands; + for (unsigned I = 0; I < NF; ++I) { + llvm::Value *V = Builder.CreateExtractValue(VTupleOperand, {I}); + Operands.push_back(V); + } + Operands.push_back(Ops[Offset]); // Ptr + if (IsMasked) + Operands.push_back(Ops[0]); + Operands.push_back(Ops[Offset + 2]); // VL + + IntrinsicTypes = {Operands[0]->getType(), Operands.back()->getType()}; + llvm::Function *F = CGM.getIntrinsic(ID, IntrinsicTypes); + return Builder.CreateCall(F, Operands, ""); + } + }] in { + defvar T = "(Tuple:" # nf # ")"; + def : RVVBuiltin<"v", "0Pe" # T # "v", type>; + } + } + } +} + +multiclass RVVUnitStridedSegLoadFFTuple { + foreach type = ["i"] in { + defvar eew = !cond(!eq(type, "i") : "32"); + foreach nf = [2] in { + let Name = op # nf # "e" # eew # "ff_v_tuple", + OverloadedName = op # nf # "e" # eew # "ff_tuple", + IRName = op # nf # "ff", + MaskedIRName = op # nf # "ff_mask", + NF = nf, + ManualCodegen = [{ + { + assert(((IsMasked && (PolicyAttrs & RVV_VTA) && (PolicyAttrs & RVV_VMA)) || + (!IsMasked && (PolicyAttrs & RVV_VTA))) && + "FIXME: Only handling default policy (TAMA) for now"); + + llvm::Type *ElementVectorType = cast(ResultType)->elements()[0]; + + IntrinsicTypes = {ElementVectorType, Ops.back()->getType()}; + SmallVector Operands; + + Operands.append(NF, llvm::PoisonValue::get(ElementVectorType)); + + unsigned Offset = IsMasked ? 
1 : 0; + Operands.push_back(Ops[Offset]); // Ptr + if (IsMasked) + Operands.push_back(Ops[0]); + Operands.push_back(Ops[Offset + 2]); // vl + if (IsMasked) + Operands.push_back(ConstantInt::get(Ops.back()->getType(), PolicyAttrs)); + + llvm::Function *F = CGM.getIntrinsic(ID, IntrinsicTypes); + + llvm::Value *LoadValue = Builder.CreateCall(F, Operands, ""); + // Get alignment from the new vl operand + clang::CharUnits Align = + CGM.getNaturalPointeeTypeAlignment(E->getArg(Offset + 1)->getType()); + + llvm::Value *ReturnTuple = llvm::PoisonValue::get(ResultType); + for (unsigned I = 0; I < NF; ++I) { + llvm::Value *V = Builder.CreateExtractValue(LoadValue, {I}); + ReturnTuple = Builder.CreateInsertValue(ReturnTuple, V, {I}); + } + + // Store new_vl + llvm::Value *V = Builder.CreateExtractValue(LoadValue, {NF}); + Builder.CreateStore(V, Address(Ops[Offset + 1], V->getType(), Align)); + + if (ReturnValue.isNull()) + return ReturnTuple; + else + return Builder.CreateStore(ReturnTuple, ReturnValue.getValue()); + } + }] in { + defvar T = "(Tuple:" # nf # ")"; + def : RVVBuiltin<"v", T # "vPCePz", type>; + } + } + } +} + +multiclass RVVStridedSegLoadTuple { + foreach type = ["i"] in { + defvar eew = !cond(!eq(type, "i") : "32"); + foreach nf = [2] in { + let Name = op # nf # "e" # eew # "_v_tuple", + OverloadedName = op # nf # "e" # eew # "_tuple", + IRName = op # nf, + MaskedIRName = op # nf # "_mask", + NF = nf, + ManualCodegen = [{ + { + assert(((IsMasked && (PolicyAttrs & RVV_VTA) && (PolicyAttrs & RVV_VMA)) || + (!IsMasked && (PolicyAttrs & RVV_VTA))) && + "FIXME: Only handling default policy (TAMA) for now"); + + llvm::Type *ElementVectorType = cast(ResultType)->elements()[0]; + + IntrinsicTypes = {ElementVectorType, Ops.back()->getType()}; + SmallVector Operands; + + Operands.append(NF, llvm::PoisonValue::get(ElementVectorType)); + + unsigned Offset = IsMasked ? 
1 : 0; + Operands.push_back(Ops[Offset]); // Ptr + Operands.push_back(Ops[Offset + 1]); // Stride + if (IsMasked) + Operands.push_back(Ops[0]); + Operands.push_back(Ops[Offset + 2]); // VL + if (IsMasked) + Operands.push_back(ConstantInt::get(Ops.back()->getType(), PolicyAttrs)); + + llvm::Function *F = CGM.getIntrinsic(ID, IntrinsicTypes); + llvm::Value *LoadValue = Builder.CreateCall(F, Operands, ""); + + if (ReturnValue.isNull()) + return LoadValue; + else + return Builder.CreateStore(LoadValue, ReturnValue.getValue()); + } + }] in { + defvar T = "(Tuple:" # nf # ")"; + def : RVVBuiltin<"v", T # "vPCet", type>; + } + } + } +} + +multiclass RVVStridedSegStoreTuple { + foreach type = ["i"] in { + defvar eew = !cond(!eq(type, "i") : "32"); + foreach nf = [2] in { + let Name = op # nf # "e" # eew # "_v_tuple", + OverloadedName = op # nf # "e" # eew # "_tuple", + IRName = op # nf, + MaskedIRName = op # nf # "_mask", + NF = nf, + HasMaskedOffOperand = false, + MaskedPolicyScheme = NonePolicy, + ManualCodegen = [{ + { + // Masked + // Builtin: (mask, ptr, stride, v_tuple, vl) + // Intrinsic: (val0, val1, ..., ptr, stride, mask, vl) + // Unmasked + // Builtin: (ptr, stride, v_tuple, vl) + // Intrinsic: (val0, val1, ..., ptr, stride, vl) + unsigned Offset = IsMasked ? 
1 : 0; + llvm::Value *VTupleOperand = Ops[Offset + 2]; + + SmallVector Operands; + for (unsigned I = 0; I < NF; ++I) { + llvm::Value *V = Builder.CreateExtractValue(VTupleOperand, {I}); + Operands.push_back(V); + } + Operands.push_back(Ops[Offset]); // Ptr + Operands.push_back(Ops[Offset + 1]); // Stride + if (IsMasked) + Operands.push_back(Ops[0]); + Operands.push_back(Ops[Offset + 3]); // VL + + IntrinsicTypes = {Operands[0]->getType(), Operands.back()->getType()}; + llvm::Function *F = CGM.getIntrinsic(ID, IntrinsicTypes); + return Builder.CreateCall(F, Operands, ""); + } + }] in { + defvar T = "(Tuple:" # nf # ")"; + def : RVVBuiltin<"v", "0Pet" # T # "v", type>; + } + } + } +} + +multiclass RVVIndexedSegLoadTuple { + foreach type = ["i"] in { + foreach eew_info = [["32", "(Log2EEW:5)"]] in { + defvar eew = eew_info[0]; + defvar eew_type = eew_info[1]; + foreach nf = [2] in { + let Name = op # nf # "ei" # eew # "_v_tuple", + OverloadedName = op # nf # "ei" # eew # "_tuple", + IRName = op # nf, + MaskedIRName = op # nf # "_mask", + NF = nf, + ManualCodegen = [{ + { + assert(((IsMasked && (PolicyAttrs & RVV_VTA) && (PolicyAttrs & RVV_VMA)) || + (!IsMasked && (PolicyAttrs & RVV_VTA))) && + "FIXME: Only handling default policy (TAMA) for now"); + + llvm::Type *ElementVectorType = cast(ResultType)->elements()[0]; + + SmallVector Operands; + + Operands.append(NF, llvm::PoisonValue::get(ElementVectorType)); + + unsigned Offset = IsMasked ? 
1 : 0; + Operands.push_back(Ops[Offset]); // Ptr + Operands.push_back(Ops[Offset + 1]); // Idx + if (IsMasked) + Operands.push_back(Ops[0]); + Operands.push_back(Ops[Offset + 2]); // VL + if (IsMasked) + Operands.push_back(ConstantInt::get(Ops.back()->getType(), PolicyAttrs)); + + IntrinsicTypes = {ElementVectorType, Ops[Offset + 1]->getType(), + Ops.back()->getType()}; + llvm::Function *F = CGM.getIntrinsic(ID, IntrinsicTypes); + llvm::Value *LoadValue = Builder.CreateCall(F, Operands, ""); + + if (ReturnValue.isNull()) + return LoadValue; + else + return Builder.CreateStore(LoadValue, ReturnValue.getValue()); + } + }] in { + defvar T = "(Tuple:" # nf # ")"; + def : RVVBuiltin<"v", T # "vPCe" # eew_type # "Uv", type>; + } + } + } + } +} + +multiclass RVVIndexedSegStoreTuple { + foreach type = ["i"] in { + foreach eew_info = [["32", "(Log2EEW:5)"]] in { + defvar eew = eew_info[0]; + defvar eew_type = eew_info[1]; + foreach nf = [2] in { + let Name = op # nf # "ei" # eew # "_v_tuple", + OverloadedName = op # nf # "ei" # eew # "_tuple", + IRName = op # nf, + MaskedIRName = op # nf # "_mask", + NF = nf, + HasMaskedOffOperand = false, + MaskedPolicyScheme = NonePolicy, + ManualCodegen = [{ + { + // Masked + // Builtin: (mask, ptr, index, v_tuple, vl) + // Intrinsic: (val0, val1, ..., ptr, index, mask, vl) + // Unmasked + // Builtin: (ptr, index, v_tuple, vl) + // Intrinsic: (val0, val1, ..., ptr, index, vl) + unsigned Offset = IsMasked ? 
1 : 0; + llvm::Value *VTupleOperand = Ops[Offset + 2]; + + SmallVector Operands; + for (unsigned I = 0; I < NF; ++I) { + llvm::Value *V = Builder.CreateExtractValue(VTupleOperand, {I}); + Operands.push_back(V); + } + Operands.push_back(Ops[Offset]); // Ptr + Operands.push_back(Ops[Offset + 1]); // Idx + if (IsMasked) + Operands.push_back(Ops[0]); + Operands.push_back(Ops[Offset + 3]); // VL + + IntrinsicTypes = {Operands[0]->getType(), Ops[Offset + 1]->getType(), + Operands.back()->getType()}; + llvm::Function *F = CGM.getIntrinsic(ID, IntrinsicTypes); + return Builder.CreateCall(F, Operands, ""); + } + }] in { + defvar T = "(Tuple:" # nf # ")"; + def : RVVBuiltin<"v", "0Pe" # eew_type # "Uv" # T # "v", type>; + } + } + } + } +} + +// TODO: Extend for policy +let UnMaskedPolicyScheme = NonePolicy, + MaskedPolicyScheme = NonePolicy, + IsTuple = true in { +defm : RVVUnitStridedSegLoadTuple<"vlseg">; +defm : RVVUnitStridedSegLoadFFTuple<"vlseg">; +defm : RVVStridedSegLoadTuple<"vlsseg">; +defm : RVVIndexedSegLoadTuple<"vluxseg">; +defm : RVVIndexedSegLoadTuple<"vloxseg">; +} + +let UnMaskedPolicyScheme = NonePolicy, + MaskedPolicyScheme = NonePolicy, + IsTuple = true in { +defm : RVVUnitStridedSegStoreTuple<"vsseg">; +defm : RVVStridedSegStoreTuple<"vssseg">; +defm : RVVIndexedSegStoreTuple<"vsuxseg">; +defm : RVVIndexedSegStoreTuple<"vsoxseg">; +} + + let UnMaskedPolicyScheme = NonePolicy, MaskedPolicyScheme = NonePolicy in { defm : RVVUnitStridedSegStore<"vsseg">; @@ -2174,6 +2536,11 @@ let Name = "vget_v", MaskedPolicyScheme = NonePolicy, ManualCodegen = [{ { + if (isa(Ops[0]->getType())) // For tuple type + // Extract value from index (operand 1) of vtuple (operand 0) + return Builder.CreateExtractValue( + Ops[0], + {(unsigned)cast(Ops[1])->getZExtValue()}); auto *VecTy = cast(ResultType); auto *OpVecTy = cast(Ops[0]->getType()); // Mask to only valid indices. 
@@ -2191,11 +2558,22 @@ def : RVVBuiltin<"v" # dst_lmul # "v", dst_lmul # "vvKz", "csilxfd", dst_lmul # "v">; def : RVVBuiltin<"Uv" # dst_lmul # "Uv", dst_lmul # "UvUvKz", "csil", dst_lmul # "Uv">; } + foreach nf = [2] in { + let Log2LMUL = [0] in { + defvar T = "(Tuple:" # nf # ")"; + def : RVVBuiltin; + } + } } let Name = "vset_v", Log2LMUL = [0, 1, 2], MaskedPolicyScheme = NonePolicy, ManualCodegen = [{ { + if (isa(ResultType)) // For tuple type + // Insert value (operand 2) into index (operand 1) of vtuple (operand 0) + return Builder.CreateInsertValue( + Ops[0], Ops[2], + {(unsigned)cast(Ops[1])->getZExtValue()}); auto *ResVecTy = cast(ResultType); auto *VecTy = cast(Ops[2]->getType()); // Mask to only valid indices. @@ -2213,5 +2591,11 @@ def : RVVBuiltin<"v" # dst_lmul # "v", dst_lmul # "v" # dst_lmul # "vKzv", "csilxfd">; def : RVVBuiltin<"Uv" # dst_lmul # "Uv", dst_lmul # "Uv" # dst_lmul #"UvKzUv", "csil">; } + foreach nf = [2] in { + let Log2LMUL = [0] in { + defvar T = "(Tuple:" # nf # ")"; + def : RVVBuiltin; + } + } } } diff --git a/clang/include/clang/Basic/riscv_vector_common.td b/clang/include/clang/Basic/riscv_vector_common.td --- a/clang/include/clang/Basic/riscv_vector_common.td +++ b/clang/include/clang/Basic/riscv_vector_common.td @@ -231,6 +231,9 @@ // Number of fields for Load/Store Segment instructions. int NF = 1; + + // Set to true if the builtin is associated with tuple types. + bit IsTuple = false; } // This is the code emitted in the header. 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3494,7 +3494,8 @@ def mwatchos_version_min_EQ : Joined<["-"], "mwatchos-version-min=">, Group; def mwatchos_simulator_version_min_EQ : Joined<["-"], "mwatchos-simulator-version-min=">; def mwatchsimulator_version_min_EQ : Joined<["-"], "mwatchsimulator-version-min=">, Alias; -def march_EQ : Joined<["-"], "march=">, Group, Flags<[CoreOption]>; +def march_EQ : Joined<["-"], "march=">, Group, Flags<[CoreOption]>, + HelpText<"For a list of available architectures for the target use '-mcpu=help'">; def masm_EQ : Joined<["-"], "masm=">, Group, Flags<[NoXarchOption]>; def inline_asm_EQ : Joined<["-"], "inline-asm=">, Group, Flags<[CC1Option]>, Values<"att,intel">, @@ -3518,7 +3519,8 @@ def mguard_EQ : Joined<["-"], "mguard=">, Group, Flags<[NoXarchOption]>, HelpText<"Enable or disable Control Flow Guard checks and guard tables emission">, Values<"none,cf,cf-nochecks">; -def mcpu_EQ : Joined<["-"], "mcpu=">, Group; +def mcpu_EQ : Joined<["-"], "mcpu=">, Group, + HelpText<"For a list of available CPUs for the target use '-mcpu=help'">; def mmcu_EQ : Joined<["-"], "mmcu=">, Group; def msim : Flag<["-"], "msim">, Group; def mdynamic_no_pic : Joined<["-"], "mdynamic-no-pic">, Group; @@ -5830,7 +5832,7 @@ Group, CodeGenOpts<"EnableAssignmentTracking">, NormalizedValuesScope<"CodeGenOptions::AssignmentTrackingOpts">, Values<"disabled,enabled,forced">, NormalizedValues<["Disabled","Enabled","Forced"]>, - MarshallingInfoEnum, "Enabled">; + MarshallingInfoEnum, "Disabled">; } // let Flags = [CC1Option, NoDriverOption] diff --git a/clang/include/clang/Support/RISCVVIntrinsicUtils.h b/clang/include/clang/Support/RISCVVIntrinsicUtils.h --- a/clang/include/clang/Support/RISCVVIntrinsicUtils.h +++ b/clang/include/clang/Support/RISCVVIntrinsicUtils.h @@ -58,6 +58,7 @@ SFixedLog2LMUL1, SFixedLog2LMUL2, 
SFixedLog2LMUL3, + Tuple2, }; // Similar to basic type but used to describe what's kind of type related to @@ -243,6 +244,8 @@ unsigned ElementBitwidth = 0; VScaleVal Scale = 0; bool Valid; + bool IsTuple = false; + unsigned NF = 0; std::string BuiltinStr; std::string ClangBuiltinStr; @@ -293,10 +296,15 @@ } bool isConstant() const { return IsConstant; } bool isPointer() const { return IsPointer; } + bool isTuple() const { return IsTuple; } unsigned getElementBitwidth() const { return ElementBitwidth; } ScalarTypeKind getScalarType() const { return ScalarType; } VScaleVal getScale() const { return Scale; } + unsigned getNF() const { + assert(NF > 1 && NF < 8 && "Only legal NF should be fetched"); + return NF; + } private: // Verify RVV vector type and set Valid. @@ -444,7 +452,7 @@ computeBuiltinTypes(llvm::ArrayRef Prototype, bool IsMasked, bool HasMaskedOffOperand, bool HasVL, unsigned NF, PolicyScheme DefaultScheme, - Policy PolicyAttrs); + Policy PolicyAttrs, bool IsTuple); static llvm::SmallVector getSupportedUnMaskedPolicies(); static llvm::SmallVector @@ -512,6 +520,7 @@ bool HasMaskedOffOperand : 1; bool HasTailPolicy : 1; bool HasMaskPolicy : 1; + bool IsTuple : 1; uint8_t UnMaskedPolicyScheme : 2; uint8_t MaskedPolicyScheme : 2; }; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -4042,8 +4042,8 @@ /// getScalableVectorType - Return the unique reference to a scalable vector /// type of the specified element type and size. VectorType must be a built-in /// type. 
-QualType ASTContext::getScalableVectorType(QualType EltTy, - unsigned NumElts) const { +QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts, + unsigned NumFields) const { if (Target->hasAArch64SVETypes()) { uint64_t EltTySize = getTypeSize(EltTy); #define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits, \ @@ -4067,15 +4067,15 @@ uint64_t EltTySize = getTypeSize(EltTy); #define RVV_VECTOR_TYPE(Name, Id, SingletonId, NumEls, ElBits, NF, IsSigned, \ IsFP) \ - if (!EltTy->isBooleanType() && \ - ((EltTy->hasIntegerRepresentation() && \ - EltTy->hasSignedIntegerRepresentation() == IsSigned) || \ - (EltTy->hasFloatingRepresentation() && IsFP)) && \ - EltTySize == ElBits && NumElts == NumEls) \ - return SingletonId; + if (!EltTy->isBooleanType() && \ + ((EltTy->hasIntegerRepresentation() && \ + EltTy->hasSignedIntegerRepresentation() == IsSigned) || \ + (EltTy->hasFloatingRepresentation() && IsFP)) && \ + EltTySize == ElBits && NumElts == NumEls && NumFields == NF) \ + return SingletonId; #define RVV_PREDICATE_TYPE(Name, Id, SingletonId, NumEls) \ - if (EltTy->isBooleanType() && NumElts == NumEls) \ - return SingletonId; + if (EltTy->isBooleanType() && NumElts == NumEls) \ + return SingletonId; #include "clang/Basic/RISCVVTypes.def" } return QualType(); diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1821,7 +1821,8 @@ if (D->hasInit()) { const Expr *E = D->getInit(); // Only dump the value of constexpr VarDecls for now. 
- if (E && !E->isValueDependent() && D->isConstexpr()) { + if (E && !E->isValueDependent() && D->isConstexpr() && + !D->getType()->isDependentType()) { const APValue *Value = D->evaluateValue(); if (Value) AddChild("value", [=] { Visit(*Value, E->getType()); }); diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp --- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp +++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp @@ -399,6 +399,18 @@ } } +void transferArrowOpCall(const Expr *UnwrapExpr, const Expr *ObjectExpr, + LatticeTransferState &State) { + if (auto *OptionalVal = + getValueBehindPossiblePointer(*ObjectExpr, State.Env)) { + if (auto *Loc = maybeInitializeOptionalValueMember( + UnwrapExpr->getType()->getPointeeType(), *OptionalVal, State.Env)) { + State.Env.setValueStrict(*UnwrapExpr, + State.Env.create(*Loc)); + } + } +} + void transferMakeOptionalCall(const CallExpr *E, const MatchFinder::MatchResult &, LatticeTransferState &State) { @@ -774,25 +786,22 @@ transferUnwrapCall(E, E->getImplicitObjectArgument(), State); }) - // optional::operator*, optional::operator-> - // FIXME: This does something slightly strange for `operator->`. - // `transferUnwrapCall()` may create a new value of type `T` for the - // `optional`, and it associates that value with `E`. In the case of - // `operator->`, `E` is a pointer. As a result, we associate an - // expression of pointer type with a storage location of non-pointer type - // `T`. This can confound other code that expects expressions of - // pointer type to be associated with `PointerValue`s, such as the - // centrally provided accessors `getImplicitObjectLocation()` and - // `getBaseObjectLocation()`, and this is the reason we need to use our - // own 'maybeSkipPointer()` and `getValueBehindPossiblePointer()` instead - // of these accessors. 
- .CaseOfCFGStmt(valueOperatorCall(std::nullopt), + // optional::operator* + .CaseOfCFGStmt(isOptionalOperatorCallWithName("*"), [](const CallExpr *E, const MatchFinder::MatchResult &, LatticeTransferState &State) { transferUnwrapCall(E, E->getArg(0), State); }) + // optional::operator-> + .CaseOfCFGStmt(isOptionalOperatorCallWithName("->"), + [](const CallExpr *E, + const MatchFinder::MatchResult &, + LatticeTransferState &State) { + transferArrowOpCall(E, E->getArg(0), State); + }) + // optional::has_value .CaseOfCFGStmt( isOptionalMemberCallWithName("has_value"), diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -48,10 +48,8 @@ static BoolValue &evaluateBooleanEquality(const Expr &LHS, const Expr &RHS, Environment &Env) { - if (auto *LHSValue = - dyn_cast_or_null(Env.getValue(LHS, SkipPast::Reference))) - if (auto *RHSValue = - dyn_cast_or_null(Env.getValue(RHS, SkipPast::Reference))) + if (auto *LHSValue = dyn_cast_or_null(Env.getValueStrict(LHS))) + if (auto *RHSValue = dyn_cast_or_null(Env.getValueStrict(RHS))) return Env.makeIff(*LHSValue, *RHSValue); return Env.makeAtomicBoolValue(); @@ -121,9 +119,7 @@ // value, if any unpacking occured. Also, does the lvalue-to-rvalue conversion, // by skipping past the reference. static Value *maybeUnpackLValueExpr(const Expr &E, Environment &Env) { - // FIXME: this is too flexible: it _allows_ a reference, while it should - // _require_ one, since lvalues should always be wrapped in `ReferenceValue`. 
- auto *Loc = Env.getStorageLocation(E, SkipPast::Reference); + auto *Loc = Env.getStorageLocationStrict(E); if (Loc == nullptr) return nullptr; auto *Val = Env.getValue(*Loc); @@ -139,6 +135,29 @@ return &UnpackedVal; } +static void propagateValue(const Expr &From, const Expr &To, Environment &Env) { + if (auto *Val = Env.getValueStrict(From)) + Env.setValueStrict(To, *Val); +} + +static void propagateStorageLocation(const Expr &From, const Expr &To, + Environment &Env) { + if (auto *Loc = Env.getStorageLocationStrict(From)) + Env.setStorageLocationStrict(To, *Loc); +} + +// Forwards the value or storage location of `From` to `To` in cases where +// `From` may be either a glvalue or a prvalue. `To` must be a glvalue iff +// `From` is a glvalue. +static void propagateValueOrStorageLocation(const Expr &From, const Expr &To, + Environment &Env) { + assert(From.isGLValue() == To.isGLValue()); + if (From.isGLValue()) + propagateStorageLocation(From, To, Env); + else + propagateValue(From, To, Env); +} + namespace { class TransferVisitor : public ConstStmtVisitor { @@ -155,13 +174,11 @@ switch (S->getOpcode()) { case BO_Assign: { - auto *LHSLoc = Env.getStorageLocation(*LHS, SkipPast::Reference); + auto *LHSLoc = Env.getStorageLocationStrict(*LHS); if (LHSLoc == nullptr) break; - // No skipping should be necessary, because any lvalues should have - // already been stripped off in evaluating the LValueToRValue cast. 
- auto *RHSVal = Env.getValue(*RHS, SkipPast::None); + auto *RHSVal = Env.getValueStrict(*RHS); if (RHSVal == nullptr) break; @@ -276,7 +293,7 @@ return; } - if (auto *InitExprVal = Env.getValue(*InitExpr, SkipPast::None)) + if (auto *InitExprVal = Env.getValueStrict(*InitExpr)) Env.setValue(Loc, *InitExprVal); if (Env.getValue(Loc) == nullptr) { @@ -443,7 +460,7 @@ } case UO_LNot: { auto *SubExprVal = - dyn_cast_or_null(Env.getValue(*SubExpr, SkipPast::None)); + dyn_cast_or_null(Env.getValueStrict(*SubExpr)); if (SubExprVal == nullptr) break; @@ -653,19 +670,13 @@ const Expr *SubExpr = S->getSubExpr(); assert(SubExpr != nullptr); - auto *SubExprLoc = Env.getStorageLocation(*SubExpr, SkipPast::None); - if (SubExprLoc == nullptr) - return; - - Env.setStorageLocation(*S, *SubExprLoc); + propagateValue(*SubExpr, *S, Env); } } void VisitCXXTemporaryObjectExpr(const CXXTemporaryObjectExpr *S) { - auto &Loc = Env.createStorageLocation(*S); - Env.setStorageLocation(*S, Loc); if (Value *Val = Env.createValue(S->getType())) - Env.setValue(Loc, *Val); + Env.setValueStrict(*S, *Val); } void VisitCallExpr(const CallExpr *S) { @@ -703,22 +714,20 @@ const Expr *SubExpr = S->getSubExpr(); assert(SubExpr != nullptr); - auto *SubExprLoc = Env.getStorageLocation(*SubExpr, SkipPast::None); - if (SubExprLoc == nullptr) + Value *SubExprVal = Env.getValueStrict(*SubExpr); + if (SubExprVal == nullptr) return; - Env.setStorageLocation(*S, *SubExprLoc); + auto &Loc = Env.createStorageLocation(*S); + Env.setStorageLocationStrict(*S, Loc); + Env.setValue(Loc, *SubExprVal); } void VisitCXXBindTemporaryExpr(const CXXBindTemporaryExpr *S) { const Expr *SubExpr = S->getSubExpr(); assert(SubExpr != nullptr); - auto *SubExprLoc = Env.getStorageLocation(*SubExpr, SkipPast::None); - if (SubExprLoc == nullptr) - return; - - Env.setStorageLocation(*S, *SubExprLoc); + propagateValue(*SubExpr, *S, Env); } void VisitCXXStaticCastExpr(const CXXStaticCastExpr *S) { @@ -726,11 +735,7 @@ const Expr *SubExpr 
= S->getSubExpr(); assert(SubExpr != nullptr); - auto *SubExprLoc = Env.getStorageLocation(*SubExpr, SkipPast::None); - if (SubExprLoc == nullptr) - return; - - Env.setStorageLocation(*S, *SubExprLoc); + propagateValueOrStorageLocation(*SubExpr, *S, Env); } } @@ -738,10 +743,14 @@ // FIXME: Revisit this once flow conditions are added to the framework. For // `a = b ? c : d` we can add `b => a == c && !b => a == d` to the flow // condition. - auto &Loc = Env.createStorageLocation(*S); - Env.setStorageLocation(*S, Loc); - if (Value *Val = Env.createValue(S->getType())) - Env.setValue(Loc, *Val); + if (S->isGLValue()) { + auto &Loc = Env.createStorageLocation(*S); + Env.setStorageLocationStrict(*S, Loc); + if (Value *Val = Env.createValue(S->getType())) + Env.setValue(Loc, *Val); + } else if (Value *Val = Env.createValue(S->getType())) { + Env.setValueStrict(*S, *Val); + } } void VisitInitListExpr(const InitListExpr *S) { @@ -780,9 +789,7 @@ } void VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *S) { - auto &Loc = Env.createStorageLocation(*S); - Env.setStorageLocation(*S, Loc); - Env.setValue(Loc, Env.getBoolLiteralValue(S->getValue())); + Env.setValueStrict(*S, Env.getBoolLiteralValue(S->getValue())); } void VisitParenExpr(const ParenExpr *S) { @@ -814,11 +821,11 @@ if (!SubExprEnv) return nullptr; - if (auto *Val = dyn_cast_or_null( - SubExprEnv->getValue(SubExpr, SkipPast::Reference))) + if (auto *Val = + dyn_cast_or_null(SubExprEnv->getValueStrict(SubExpr))) return Val; - if (Env.getStorageLocation(SubExpr, SkipPast::None) == nullptr) { + if (Env.getValueStrict(SubExpr) == nullptr) { // Sub-expressions that are logic operators are not added in basic blocks // (e.g. see CFG for `bool d = a && (b || c);`). If `SubExpr` is a logic // operator, it may not have been evaluated and assigned a value yet. 
In @@ -827,8 +834,7 @@ Visit(&SubExpr); } - if (auto *Val = dyn_cast_or_null( - Env.getValue(SubExpr, SkipPast::Reference))) + if (auto *Val = dyn_cast_or_null(Env.getValueStrict(SubExpr))) return Val; // If the value of `SubExpr` is still unknown, we create a fresh symbolic diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp --- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp +++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp @@ -123,13 +123,10 @@ private: TerminatorVisitorRetTy extendFlowCondition(const Expr &Cond) { // The terminator sub-expression might not be evaluated. - if (Env.getStorageLocation(Cond, SkipPast::None) == nullptr) + if (Env.getValueStrict(Cond) == nullptr) transfer(StmtToEnv, Cond, Env); - // FIXME: The flow condition must be an r-value, so `SkipPast::None` should - // suffice. - auto *Val = - cast_or_null(Env.getValue(Cond, SkipPast::Reference)); + auto *Val = cast_or_null(Env.getValueStrict(Cond)); // Value merging depends on flow conditions from different environments // being mutually exclusive -- that is, they cannot both be true in their // entirety (even if they may share some clauses). So, we need *some* value @@ -219,7 +216,8 @@ // operator includes a branch that contains a noreturn destructor call. // // See `NoreturnDestructorTest` for concrete examples. 
- if (Block.succ_begin()->getReachableBlock()->hasNoReturnElement()) { + if (Block.succ_begin()->getReachableBlock() != nullptr && + Block.succ_begin()->getReachableBlock()->hasNoReturnElement()) { auto &StmtToBlock = AC.CFCtx.getStmtToBlock(); auto StmtBlock = StmtToBlock.find(Block.getTerminatorStmt()); assert(StmtBlock != StmtToBlock.end()); @@ -303,18 +301,14 @@ auto *InitStmt = Init->getInit(); assert(InitStmt != nullptr); - auto *InitStmtLoc = Env.getStorageLocation(*InitStmt, SkipPast::Reference); - if (InitStmtLoc == nullptr) - return; - - auto *InitStmtVal = Env.getValue(*InitStmtLoc); - if (InitStmtVal == nullptr) - return; - if (Member->getType()->isReferenceType()) { + auto *InitStmtLoc = Env.getStorageLocationStrict(*InitStmt); + if (InitStmtLoc == nullptr) + return; + auto &MemberLoc = ThisLoc.getChild(*Member); Env.setValue(MemberLoc, Env.create(*InitStmtLoc)); - } else { + } else if (auto *InitStmtVal = Env.getValueStrict(*InitStmt)) { auto &MemberLoc = ThisLoc.getChild(*Member); Env.setValue(MemberLoc, *InitStmtVal); } diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19793,6 +19793,14 @@ ICEArguments |= (1 << 2); for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) { + // Handle aggregate argument, namely RVV tuple types in segment load/store + if (hasAggregateEvaluationKind(E->getArg(i)->getType())) { + LValue L = EmitAggExprToLValue(E->getArg(i)); + llvm::Value *AggValue = Builder.CreateLoad(L.getAddress(*this)); + Ops.push_back(AggValue); + continue; + } + // If this is a normal argument, just emit it as a scalar. 
if ((ICEArguments & (1 << i)) == 0) { Ops.push_back(EmitScalarExpr(E->getArg(i))); diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -3132,30 +3132,51 @@ llvm::StructType *STy = dyn_cast(ArgI.getCoerceToType()); if (ArgI.isDirect() && ArgI.getCanBeFlattened() && STy && STy->getNumElements() > 1) { - uint64_t SrcSize = CGM.getDataLayout().getTypeAllocSize(STy); - llvm::Type *DstTy = Ptr.getElementType(); - uint64_t DstSize = CGM.getDataLayout().getTypeAllocSize(DstTy); + llvm::TypeSize StructSize = CGM.getDataLayout().getTypeAllocSize(STy); + llvm::TypeSize PtrElementSize = + CGM.getDataLayout().getTypeAllocSize(Ptr.getElementType()); + if (StructSize.isScalable()) { + assert(STy->containsHomogeneousScalableVectorTypes() && + "ABI only supports structure with homogeneous scalable vector " + "type"); + assert(StructSize == PtrElementSize && + "Only allow non-fractional movement of structure with" + "homogeneous scalable vector type"); + assert(STy->getNumElements() == NumIRArgs); + + llvm::Value *LoadedStructValue = llvm::PoisonValue::get(STy); + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + auto *AI = Fn->getArg(FirstIRArg + i); + AI->setName(Arg->getName() + ".coerce" + Twine(i)); + LoadedStructValue = + Builder.CreateInsertValue(LoadedStructValue, AI, i); + } - Address AddrToStoreInto = Address::invalid(); - if (SrcSize <= DstSize) { - AddrToStoreInto = Builder.CreateElementBitCast(Ptr, STy); + Builder.CreateStore(LoadedStructValue, Ptr); } else { - AddrToStoreInto = - CreateTempAlloca(STy, Alloca.getAlignment(), "coerce"); - } + uint64_t SrcSize = StructSize.getFixedValue(); + uint64_t DstSize = PtrElementSize.getFixedValue(); + + Address AddrToStoreInto = Address::invalid(); + if (SrcSize <= DstSize) { + AddrToStoreInto = Builder.CreateElementBitCast(Ptr, STy); + } else { + AddrToStoreInto = + CreateTempAlloca(STy, Alloca.getAlignment(), "coerce"); 
+ } - assert(STy->getNumElements() == NumIRArgs); - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - auto AI = Fn->getArg(FirstIRArg + i); - AI->setName(Arg->getName() + ".coerce" + Twine(i)); - Address EltPtr = Builder.CreateStructGEP(AddrToStoreInto, i); - Builder.CreateStore(AI, EltPtr); - } + assert(STy->getNumElements() == NumIRArgs); + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + auto AI = Fn->getArg(FirstIRArg + i); + AI->setName(Arg->getName() + ".coerce" + Twine(i)); + Address EltPtr = Builder.CreateStructGEP(AddrToStoreInto, i); + Builder.CreateStore(AI, EltPtr); + } - if (SrcSize > DstSize) { - Builder.CreateMemCpy(Ptr, AddrToStoreInto, DstSize); + if (SrcSize > DstSize) { + Builder.CreateMemCpy(Ptr, AddrToStoreInto, DstSize); + } } - } else { // Simple case, just do a coerced store of the argument into the alloca. assert(NumIRArgs == 1); diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -5349,8 +5349,9 @@ CGCallee Callee = OrigCallee; - if (getLangOpts().CPlusPlus && SanOpts.has(SanitizerKind::Function) && - (!TargetDecl || !isa(TargetDecl))) { + if (SanOpts.has(SanitizerKind::Function) && + (!TargetDecl || !isa(TargetDecl)) && + !isa(PointeeType)) { if (llvm::Constant *PrefixSig = CGM.getTargetCodeGenInfo().getUBSanFunctionSignature(CGM)) { SanitizerScope SanScope(this); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -572,10 +572,11 @@ CodeGenFunction::getUBSanFunctionTypeHash(QualType Ty) const { // Remove any (C++17) exception specifications, to allow calling e.g. a // noexcept function through a non-noexcept pointer. 
- auto ProtoTy = getContext().getFunctionTypeWithExceptionSpec(Ty, EST_None); + if (!isa(Ty)) + Ty = getContext().getFunctionTypeWithExceptionSpec(Ty, EST_None); std::string Mangled; llvm::raw_string_ostream Out(Mangled); - CGM.getCXXABI().getMangleContext().mangleTypeName(ProtoTy, Out, false); + CGM.getCXXABI().getMangleContext().mangleTypeName(Ty, Out, false); return llvm::ConstantInt::get(CGM.Int32Ty, static_cast(llvm::xxHash64(Mangled))); } @@ -945,7 +946,7 @@ // If we are checking function types, emit a function type signature as // prologue data. - if (FD && getLangOpts().CPlusPlus && SanOpts.has(SanitizerKind::Function)) { + if (FD && SanOpts.has(SanitizerKind::Function)) { if (llvm::Constant *PrologueSig = getPrologueSignature(CGM, FD)) { llvm::LLVMContext &Ctx = Fn->getContext(); llvm::MDBuilder MDB(Ctx); diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -630,13 +630,22 @@ #include "clang/Basic/PPCTypes.def" #define RVV_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/RISCVVTypes.def" - { - ASTContext::BuiltinVectorTypeInfo Info = - Context.getBuiltinVectorTypeInfo(cast(Ty)); - return llvm::ScalableVectorType::get(ConvertType(Info.ElementType), - Info.EC.getKnownMinValue() * - Info.NumVectors); - } + { + ASTContext::BuiltinVectorTypeInfo Info = + Context.getBuiltinVectorTypeInfo(cast(Ty)); + // Tuple types are expressed as aggregregate types of the same scalable + // vector type (e.g. vint32m1x2_t is two vint32m1_t, which is {, }). 
+ if (Info.NumVectors != 1) { + llvm::Type *EltTy = llvm::ScalableVectorType::get( + ConvertType(Info.ElementType), Info.EC.getKnownMinValue()); + llvm::SmallVector EltTys(Info.NumVectors, EltTy); + return llvm::StructType::get(getLLVMContext(), EltTys); + } + return llvm::ScalableVectorType::get(ConvertType(Info.ElementType), + Info.EC.getKnownMinValue() * + Info.NumVectors); + } #define WASM_REF_TYPE(Name, MangledName, Id, SingletonId, AS) \ case BuiltinType::Id: { \ if (BuiltinType::Id == BuiltinType::WasmExternRef) \ diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -4522,9 +4522,12 @@ ASTContext::BuiltinVectorTypeInfo VecInfo = Context.getBuiltinVectorTypeInfo(cast( TheCall->getArg(0)->getType().getCanonicalType().getTypePtr())); - unsigned MaxIndex = - (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors) / - (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors); + unsigned MaxIndex; + if (VecInfo.NumVectors != 1) // vget for tuple type + MaxIndex = VecInfo.NumVectors; + else // vget for non-tuple type + MaxIndex = (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors) / + (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors); return SemaBuiltinConstantArgRange(TheCall, 1, 0, MaxIndex - 1); } case RISCVVector::BI__builtin_rvv_vset_v: { @@ -4534,9 +4537,12 @@ ASTContext::BuiltinVectorTypeInfo VecInfo = Context.getBuiltinVectorTypeInfo(cast( TheCall->getArg(2)->getType().getCanonicalType().getTypePtr())); - unsigned MaxIndex = - (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors) / - (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors); + unsigned MaxIndex; + if (ResVecInfo.NumVectors != 1) // vset for tuple type + MaxIndex = ResVecInfo.NumVectors; + else // vset fo non-tuple type + MaxIndex = (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors) / + (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors); return 
SemaBuiltinConstantArgRange(TheCall, 1, 0, MaxIndex - 1); } case RISCVVector::BI__builtin_rvv_sf_vc_i_se_u8mf8: diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -811,7 +811,7 @@ // order to leave them uninitialized, the ILE is expanded and the extra // fields are then filled with NoInitExpr. unsigned NumElems = numStructUnionElements(ILE->getType()); - if (RDecl->hasFlexibleArrayMember()) + if (!RDecl->isUnion() && RDecl->hasFlexibleArrayMember()) ++NumElems; if (!VerifyOnly && ILE->getNumInits() < NumElems) ILE->resizeInits(SemaRef.Context, NumElems); diff --git a/clang/lib/Sema/SemaRISCVVectorLookup.cpp b/clang/lib/Sema/SemaRISCVVectorLookup.cpp --- a/clang/lib/Sema/SemaRISCVVectorLookup.cpp +++ b/clang/lib/Sema/SemaRISCVVectorLookup.cpp @@ -135,8 +135,12 @@ case Invalid: llvm_unreachable("Unhandled type."); } - if (Type->isVector()) - QT = Context.getScalableVectorType(QT, *Type->getScale()); + if (Type->isVector()) { + if (Type->isTuple()) + QT = Context.getScalableVectorType(QT, *Type->getScale(), Type->getNF()); + else + QT = Context.getScalableVectorType(QT, *Type->getScale()); + } if (Type->isConstant()) QT = Context.getConstType(QT); @@ -214,15 +218,16 @@ const Policy DefaultPolicy; llvm::SmallVector ProtoSeq = - RVVIntrinsic::computeBuiltinTypes(BasicProtoSeq, /*IsMasked=*/false, - /*HasMaskedOffOperand=*/false, - Record.HasVL, Record.NF, - UnMaskedPolicyScheme, DefaultPolicy); + RVVIntrinsic::computeBuiltinTypes( + BasicProtoSeq, /*IsMasked=*/false, + /*HasMaskedOffOperand=*/false, Record.HasVL, Record.NF, + UnMaskedPolicyScheme, DefaultPolicy, Record.IsTuple); llvm::SmallVector ProtoMaskSeq = RVVIntrinsic::computeBuiltinTypes( BasicProtoSeq, /*IsMasked=*/true, Record.HasMaskedOffOperand, - Record.HasVL, Record.NF, MaskedPolicyScheme, DefaultPolicy); + Record.HasVL, Record.NF, MaskedPolicyScheme, DefaultPolicy, + Record.IsTuple); bool UnMaskedHasPolicy = 
UnMaskedPolicyScheme != PolicyScheme::SchemeNone; bool MaskedHasPolicy = MaskedPolicyScheme != PolicyScheme::SchemeNone; @@ -280,7 +285,7 @@ RVVIntrinsic::computeBuiltinTypes( BasicProtoSeq, /*IsMasked=*/false, /*HasMaskedOffOperand=*/false, Record.HasVL, Record.NF, - UnMaskedPolicyScheme, P); + UnMaskedPolicyScheme, P, Record.IsTuple); std::optional PolicyTypes = TypeCache.computeTypes( BaseType, Log2LMUL, Record.NF, PolicyPrototype); InitRVVIntrinsic(Record, SuffixStr, OverloadedSuffixStr, @@ -301,8 +306,9 @@ for (auto P : SupportedMaskedPolicies) { llvm::SmallVector PolicyPrototype = RVVIntrinsic::computeBuiltinTypes( - BasicProtoSeq, /*IsMasked=*/true, Record.HasMaskedOffOperand, - Record.HasVL, Record.NF, MaskedPolicyScheme, P); + BasicProtoSeq, /*IsMasked=*/true, + Record.HasMaskedOffOperand, Record.HasVL, Record.NF, + MaskedPolicyScheme, P, Record.IsTuple); std::optional PolicyTypes = TypeCache.computeTypes( BaseType, Log2LMUL, Record.NF, PolicyPrototype); InitRVVIntrinsic(Record, SuffixStr, OverloadedSuffixStr, diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -400,8 +400,10 @@ Clobbers.push_back(cast_or_null(Record.readSubStmt())); // Labels - for (unsigned I = 0, N = NumLabels; I != N; ++I) + for (unsigned I = 0, N = NumLabels; I != N; ++I) { + Names.push_back(Record.readIdentifier()); Exprs.push_back(Record.readSubStmt()); + } S->setOutputsAndInputsAndClobbers(Record.getContext(), Names.data(), Constraints.data(), diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -317,7 +317,10 @@ Record.AddStmt(S->getClobberStringLiteral(I)); // Labels - for (auto *E : S->labels()) Record.AddStmt(E); + for (unsigned I = 0, N = S->getNumLabels(); I != N; ++I) { + 
Record.AddIdentifierRef(S->getLabelIdentifier(I)); + Record.AddStmt(S->getLabelExpr(I)); + } Code = serialization::STMT_GCCASM; } diff --git a/clang/lib/Support/RISCVVIntrinsicUtils.cpp b/clang/lib/Support/RISCVVIntrinsicUtils.cpp --- a/clang/lib/Support/RISCVVIntrinsicUtils.cpp +++ b/clang/lib/Support/RISCVVIntrinsicUtils.cpp @@ -113,6 +113,8 @@ return false; if (isFloat() && ElementBitwidth == 8) return false; + if (IsTuple && (NF == 1 || NF > 8)) + return false; unsigned V = *Scale; switch (ElementBitwidth) { case 1: @@ -214,6 +216,9 @@ // vector values. if (IsPointer) BuiltinStr += "*"; + + if (IsTuple) + BuiltinStr = "T" + utostr(NF) + BuiltinStr; } void RVVType::initClangBuiltinStr() { @@ -237,7 +242,8 @@ default: llvm_unreachable("ScalarTypeKind is invalid"); } - ClangBuiltinStr += utostr(ElementBitwidth) + LMUL.str() + "_t"; + ClangBuiltinStr += utostr(ElementBitwidth) + LMUL.str() + + (IsTuple ? "x" + utostr(NF) : "") + "_t"; } void RVVType::initTypeStr() { @@ -249,7 +255,8 @@ auto getTypeString = [&](StringRef TypeStr) { if (isScalar()) return Twine(TypeStr + Twine(ElementBitwidth) + "_t").str(); - return Twine("v" + TypeStr + Twine(ElementBitwidth) + LMUL.str() + "_t") + return Twine("v" + TypeStr + Twine(ElementBitwidth) + LMUL.str() + + (IsTuple ? 
"x" + utostr(NF) : "") + "_t") .str(); }; @@ -325,6 +332,8 @@ } if (isVector()) ShortStr += LMUL.str(); + if (isTuple()) + ShortStr += "x" + utostr(NF); } void RVVType::applyBasicType() { @@ -542,6 +551,19 @@ return std::nullopt; } + } else if (ComplexTT.first == "Tuple") { + unsigned NF = 0; + if (ComplexTT.second.getAsInteger(10, NF)) { + llvm_unreachable("Invalid NF value!"); + return std::nullopt; + } + switch (NF) { + case 2: + VTM = VectorTypeModifier::Tuple2; + break; + default: + llvm_unreachable("Unhandled NF"); + } } else { llvm_unreachable("Illegal complex type transformers!"); } @@ -702,6 +724,11 @@ case VectorTypeModifier::SFixedLog2LMUL3: applyFixedLog2LMUL(3, FixedLMULType::SmallerThan); break; + case VectorTypeModifier::Tuple2: { + IsTuple = true; + NF = 2; + break; + } case VectorTypeModifier::NoModifier: break; } @@ -912,7 +939,7 @@ llvm::SmallVector RVVIntrinsic::computeBuiltinTypes( llvm::ArrayRef Prototype, bool IsMasked, bool HasMaskedOffOperand, bool HasVL, unsigned NF, - PolicyScheme DefaultScheme, Policy PolicyAttrs) { + PolicyScheme DefaultScheme, Policy PolicyAttrs, bool IsTuple) { SmallVector NewPrototype(Prototype.begin(), Prototype.end()); bool HasPassthruOp = DefaultScheme == PolicyScheme::HasPassthruOperand; @@ -938,8 +965,12 @@ // to // (void, op0 address, op1 address, ..., mask, maskedoff0, maskedoff1, // ...) - NewPrototype.insert(NewPrototype.begin() + NF + 1, - PrototypeDescriptor::Mask); + if (IsTuple) + NewPrototype.insert(NewPrototype.begin() + 1, + PrototypeDescriptor::Mask); + else + NewPrototype.insert(NewPrototype.begin() + NF + 1, + PrototypeDescriptor::Mask); } else { // If IsMasked, insert PrototypeDescriptor:Mask as first input operand. 
NewPrototype.insert(NewPrototype.begin() + 1, PrototypeDescriptor::Mask); @@ -963,6 +994,8 @@ // If HasVL, append PrototypeDescriptor:VL to last operand if (HasVL) NewPrototype.push_back(PrototypeDescriptor::VL); + if (IsTuple) + NewPrototype[0].VTM = static_cast(VectorTypeModifier::Tuple2); return NewPrototype; } @@ -1077,6 +1110,7 @@ OS << (int)Record.HasMaskedOffOperand << ","; OS << (int)Record.HasTailPolicy << ","; OS << (int)Record.HasMaskPolicy << ","; + OS << (int)Record.IsTuple << ","; OS << (int)Record.UnMaskedPolicyScheme << ","; OS << (int)Record.MaskedPolicyScheme << ","; OS << "},\n"; diff --git a/clang/test/AST/ast-dump-decl.cpp b/clang/test/AST/ast-dump-decl.cpp --- a/clang/test/AST/ast-dump-decl.cpp +++ b/clang/test/AST/ast-dump-decl.cpp @@ -818,3 +818,38 @@ // CHECK: `-TextComment // CHECK: VarDecl {{.*}} Test 'int' extern // CHECK-NOT: FullComment + +namespace TestConstexprVariableTemplateWithInitializer { + template constexpr T foo{}; + // CHECK: VarTemplateDecl 0x{{.+}} <{{.+}}:[[@LINE-1]]:3, col:40> col:36 foo + // CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} col:21 referenced typename depth 0 index 0 T + // CHECK-NEXT: `-VarDecl 0x{{.+}} col:36 foo 'const T' constexpr listinit + // CHECK-NEXT: `-InitListExpr 0x{{.+}} 'void' + + template constexpr int val{42}; + // CHECK: VarTemplateDecl 0x{{.+}} <{{.+}}:[[@LINE-1]]:3, col:44> col:38 val + // CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} col:21 typename depth 0 index 0 T + // CHECK-NEXT: `-VarDecl 0x{{.+}} col:38 val 'const int' constexpr listinit + // CHECK-NEXT: |-value: Int 42 + // CHECK-NEXT: `-InitListExpr 0x{{.+}} 'int' + + template + struct in_place_type_t { + explicit in_place_type_t() = default; + }; + + template + inline constexpr in_place_type_t<_Tp> in_place_type{}; + // CHECK: -VarTemplateDecl 0x{{.+}} col:41 in_place_type + // CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} col:22 referenced typename depth 0 index 0 _Tp + // CHECK-NEXT: `-VarDecl 0x{{.+}} col:41 in_place_type 'const 
in_place_type_t<_Tp>':'const in_place_type_t<_Tp>' inline constexpr listinit + // CHECK-NEXT: `-InitListExpr 0x{{.+}} 'void' + + template constexpr T call_init(0); + // CHECK: -VarTemplateDecl 0x{{.+}} col:37 call_init + // CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} col:22 referenced typename depth 0 index 0 T + // CHECK-NEXT: `-VarDecl 0x{{.+}} col:37 call_init 'const T' constexpr callinit + // CHECK-NEXT: `-ParenListExpr 0x{{.+}} 'NULL TYPE' + // CHECK-NEXT: `-IntegerLiteral 0x{{.+}} 'int' 0 + +} diff --git a/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbb.c b/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbb.c --- a/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbb.c +++ b/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbb.c @@ -50,6 +50,18 @@ return __builtin_riscv_clz_64(a); } +// RV64ZBB-LABEL: @ctz_32( +// RV64ZBB-NEXT: entry: +// RV64ZBB-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// RV64ZBB-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// RV64ZBB-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// RV64ZBB-NEXT: [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP0]], i1 false) +// RV64ZBB-NEXT: ret i32 [[TMP1]] +// +int ctz_32(int a) { + return __builtin_riscv_ctz_32(a); +} + // RV64ZBB-LABEL: @ctz_64( // RV64ZBB-NEXT: entry: // RV64ZBB-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vget_tuple.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vget_tuple.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vget_tuple.c @@ -0,0 +1,20 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S 
-passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vget_v_i32m1x2_i32m1 +// CHECK-RV64-SAME: ( [[SRC_COERCE0:%.*]], [[SRC_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[SRC_COERCE0]], 0 +// CHECK-RV64-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[SRC_COERCE1]], 1 +// CHECK-RV64-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-RV64-NEXT: ret [[TMP2]] +// +vint32m1_t test_vget_v_i32m1x2_i32m1(vint32m1x2_t src) { + return __riscv_vget_v_i32m1x2_i32m1(src, 0); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei32_tuple.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei32_tuple.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vloxseg2ei32_tuple.c @@ -0,0 +1,28 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local { , } @test_vloxseg2ei32_v_tuple_i32m1 +// CHECK-RV64-SAME: (ptr noundef [[BASE:%.*]], [[BINDEX:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call { , } @llvm.riscv.vloxseg2.nxv2i32.nxv2i32.i64( poison, poison, ptr [[BASE]], [[BINDEX]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret { , } [[TMP0]] +// +vint32m1x2_t test_vloxseg2ei32_v_tuple_i32m1(const int32_t *base, vuint32m1_t bindex, size_t vl) { + return __riscv_vloxseg2ei32_v_tuple_i32m1(base, bindex, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local { , } @test_vloxseg2ei32_v_tuple_i32m1_m +// CHECK-RV64-SAME: ( [[MASK:%.*]], ptr noundef [[BASE:%.*]], [[BINDEX:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call { , } @llvm.riscv.vloxseg2.mask.nxv2i32.nxv2i32.i64( poison, poison, ptr [[BASE]], [[BINDEX]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret { , } [[TMP0]] +// +vint32m1x2_t test_vloxseg2ei32_v_tuple_i32m1_m(vbool32_t mask, const int32_t *base, vuint32m1_t bindex, size_t vl) { + return __riscv_vloxseg2ei32_v_tuple_i32m1_m(mask, base, bindex, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32_tuple.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32_tuple.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32_tuple.c @@ -0,0 +1,27 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s +#include + +// CHECK-RV64-LABEL: define dso_local { , } @test_vlseg2e32_v_tuple_i32m1 +// CHECK-RV64-SAME: (ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call { , } @llvm.riscv.vlseg2.nxv2i32.i64( poison, poison, ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret { , } [[TMP0]] +// +vint32m1x2_t test_vlseg2e32_v_tuple_i32m1(const int32_t *base, size_t vl) { + return __riscv_vlseg2e32_v_tuple_i32m1(base, vl); +} + +// CHECK-RV64-LABEL: define dso_local { , } @test_vlseg2e32_v_tuple_i32m1_m +// CHECK-RV64-SAME: ( [[MASK:%.*]], ptr 
noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call { , } @llvm.riscv.vlseg2.mask.nxv2i32.i64( poison, poison, ptr [[BASE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret { , } [[TMP0]] +// +vint32m1x2_t test_vlseg2e32_v_tuple_i32m1_m(vbool32_t mask, const int32_t *base, size_t vl) { + return __riscv_vlseg2e32_v_tuple_i32m1_m(mask, base, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32ff_tuple.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32ff_tuple.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlseg2e32ff_tuple.c @@ -0,0 +1,39 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s +#include + +// CHECK-RV64-LABEL: define dso_local { , } @test_vlseg2e32ff_v_tuple_i32m1 +// CHECK-RV64-SAME: (ptr noundef [[BASE:%.*]], ptr noundef [[NEW_VL:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call { , , i64 } @llvm.riscv.vlseg2ff.nxv2i32.i64( poison, poison, ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP1:%.*]] = extractvalue { , , i64 } [[TMP0]], 0 +// CHECK-RV64-NEXT: [[TMP2:%.*]] = insertvalue { , } poison, [[TMP1]], 0 +// CHECK-RV64-NEXT: [[TMP3:%.*]] = extractvalue { , , i64 } [[TMP0]], 1 +// CHECK-RV64-NEXT: [[TMP4:%.*]] = insertvalue { , } [[TMP2]], [[TMP3]], 1 +// CHECK-RV64-NEXT: [[TMP5:%.*]] = extractvalue { , , i64 } [[TMP0]], 2 +// CHECK-RV64-NEXT: store i64 [[TMP5]], ptr [[NEW_VL]], align 8 +// 
CHECK-RV64-NEXT: ret { , } [[TMP4]] +// +vint32m1x2_t test_vlseg2e32ff_v_tuple_i32m1(const int32_t *base, size_t *new_vl, size_t vl) { + return __riscv_vlseg2e32ff_v_tuple_i32m1(base, new_vl, vl); +} + +// CHECK-RV64-LABEL: define dso_local { , } @test_vlseg2e32ff_v_tuple_i32m1_m +// CHECK-RV64-SAME: ( [[MASK:%.*]], ptr noundef [[BASE:%.*]], ptr noundef [[NEW_VL:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call { , , i64 } @llvm.riscv.vlseg2ff.mask.nxv2i32.i64( poison, poison, ptr [[BASE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP1:%.*]] = extractvalue { , , i64 } [[TMP0]], 0 +// CHECK-RV64-NEXT: [[TMP2:%.*]] = insertvalue { , } poison, [[TMP1]], 0 +// CHECK-RV64-NEXT: [[TMP3:%.*]] = extractvalue { , , i64 } [[TMP0]], 1 +// CHECK-RV64-NEXT: [[TMP4:%.*]] = insertvalue { , } [[TMP2]], [[TMP3]], 1 +// CHECK-RV64-NEXT: [[TMP5:%.*]] = extractvalue { , , i64 } [[TMP0]], 2 +// CHECK-RV64-NEXT: store i64 [[TMP5]], ptr [[NEW_VL]], align 8 +// CHECK-RV64-NEXT: ret { , } [[TMP4]] +// +vint32m1x2_t test_vlseg2e32ff_v_tuple_i32m1_m(vbool32_t mask, const int32_t *base, size_t *new_vl, size_t vl) { + return __riscv_vlseg2e32ff_v_tuple_i32m1_m(mask, base, new_vl, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg2e32_tuple.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg2e32_tuple.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vlsseg2e32_tuple.c @@ -0,0 +1,27 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck 
--check-prefix=CHECK-RV64 %s +#include + +// CHECK-RV64-LABEL: define dso_local { , } @test_vlsseg2e32_v_tuple_i32m1 +// CHECK-RV64-SAME: (ptr noundef [[BASE:%.*]], i64 noundef [[BSTRIDE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call { , } @llvm.riscv.vlsseg2.nxv2i32.i64( poison, poison, ptr [[BASE]], i64 [[BSTRIDE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret { , } [[TMP0]] +// +vint32m1x2_t test_vlsseg2e32_v_tuple_i32m1(const int32_t *base, ptrdiff_t bstride, size_t vl) { + return __riscv_vlsseg2e32_v_tuple_i32m1(base, bstride, vl); +} + +// CHECK-RV64-LABEL: define dso_local { , } @test_vlsseg2e32_v_tuple_i32m1_m +// CHECK-RV64-SAME: ( [[MASK:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[BSTRIDE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call { , } @llvm.riscv.vlsseg2.mask.nxv2i32.i64( poison, poison, ptr [[BASE]], i64 [[BSTRIDE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret { , } [[TMP0]] +// +vint32m1x2_t test_vlsseg2e32_v_tuple_i32m1_m(vbool32_t mask, const int32_t *base, ptrdiff_t bstride, size_t vl) { + return __riscv_vlsseg2e32_v_tuple_i32m1_m(mask, base, bstride, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei32_tuple.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei32_tuple.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vluxseg2ei32_tuple.c @@ -0,0 +1,28 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck 
--check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local { , } @test_vluxseg2ei32_v_tuple_i32m1 +// CHECK-RV64-SAME: (ptr noundef [[BASE:%.*]], [[BINDEX:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call { , } @llvm.riscv.vluxseg2.nxv2i32.nxv2i32.i64( poison, poison, ptr [[BASE]], [[BINDEX]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret { , } [[TMP0]] +// +vint32m1x2_t test_vluxseg2ei32_v_tuple_i32m1(const int32_t *base, vuint32m1_t bindex, size_t vl) { + return __riscv_vluxseg2ei32_v_tuple_i32m1(base, bindex, vl); +} + +// CHECK-RV64-LABEL: define dso_local { , } @test_vluxseg2ei32_v_tuple_i32m1_m +// CHECK-RV64-SAME: ( [[MASK:%.*]], ptr noundef [[BASE:%.*]], [[BINDEX:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call { , } @llvm.riscv.vluxseg2.mask.nxv2i32.nxv2i32.i64( poison, poison, ptr [[BASE]], [[BINDEX]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret { , } [[TMP0]] +// +vint32m1x2_t test_vluxseg2ei32_v_tuple_i32m1_m(vbool32_t mask, const int32_t *base, vuint32m1_t bindex, size_t vl) { + return __riscv_vluxseg2ei32_v_tuple_i32m1_m(mask, base, bindex, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vset_tuple.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vset_tuple.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vset_tuple.c @@ -0,0 +1,20 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// 
CHECK-RV64-LABEL: define dso_local { , } @test_vset_v_i32m1x2_i32m1 +// CHECK-RV64-SAME: ( [[DEST_COERCE0:%.*]], [[DEST_COERCE1:%.*]], [[VAL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[DEST_COERCE0]], 0 +// CHECK-RV64-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[DEST_COERCE1]], 1 +// CHECK-RV64-NEXT: [[TMP2:%.*]] = insertvalue { , } [[TMP1]], [[VAL]], 0 +// CHECK-RV64-NEXT: ret { , } [[TMP2]] +// +vint32m1x2_t test_vset_v_i32m1x2_i32m1(vint32m1x2_t dest, vint32m1_t val) { + return __riscv_vset_v_i32m1x2_i32m1(dest, 0, val); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei32_tuple.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei32_tuple.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsoxseg2ei32_tuple.c @@ -0,0 +1,36 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_tuple_i32m1 +// CHECK-RV64-SAME: (ptr noundef [[BASE:%.*]], [[BINDEX:%.*]], [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// CHECK-RV64-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// CHECK-RV64-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-RV64-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg2.nxv2i32.nxv2i32.i64( [[TMP2]], [[TMP3]], ptr [[BASE]], [[BINDEX]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_tuple_i32m1(int32_t *base, vuint32m1_t bindex, vint32m1x2_t v_tuple, size_t vl) { + return __riscv_vsoxseg2ei32_v_tuple_i32m1(base, bindex, v_tuple, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_tuple_i32m1_m +// CHECK-RV64-SAME: ( [[MASK:%.*]], ptr noundef [[BASE:%.*]], [[BINDEX:%.*]], [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// CHECK-RV64-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// CHECK-RV64-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-RV64-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.nxv2i32.nxv2i32.i64( [[TMP2]], [[TMP3]], ptr [[BASE]], [[BINDEX]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_tuple_i32m1_m(vbool32_t mask, int32_t *base, vuint32m1_t bindex, vint32m1x2_t v_tuple, size_t vl) { + return __riscv_vsoxseg2ei32_v_tuple_i32m1_m(mask, base, bindex, v_tuple, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg2e32_tuple.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg2e32_tuple.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsseg2e32_tuple.c @@ -0,0 +1,31 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// 
RUN: FileCheck --check-prefix=CHECK-RV64 %s +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsseg2e32_v_tuple_i32m1 +// CHECK-RV64-SAME: (ptr noundef [[BASE:%.*]], [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// CHECK-RV64-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// CHECK-RV64-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-RV64-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-RV64-NEXT: call void @llvm.riscv.vsseg2.nxv2i32.i64( [[TMP2]], [[TMP3]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsseg2e32_v_tuple_i32m1(int32_t *base, vint32m1x2_t v_tuple, size_t vl) { + return __riscv_vsseg2e32_v_tuple_i32m1(base, v_tuple, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsseg2e32_v_i32m1_m +// CHECK-RV64-SAME: ( [[MASK:%.*]], ptr noundef [[BASE:%.*]], [[V0:%.*]], [[V1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsseg2.mask.nxv2i32.i64( [[V0]], [[V1]], ptr [[BASE]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsseg2e32_v_i32m1_m(vbool32_t mask, int32_t *base, vint32m1_t v0, vint32m1_t v1, size_t vl) { + return __riscv_vsseg2e32_v_i32m1_m(mask, base, v0, v1, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg2e32_tuple.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg2e32_tuple.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vssseg2e32_tuple.c @@ -0,0 +1,36 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 
-target-feature +v -target-feature +zfh \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vssseg2e32_v_tuple_i32m1 +// CHECK-RV64-SAME: (ptr noundef [[BASE:%.*]], i64 noundef [[BSTRIDE:%.*]], [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// CHECK-RV64-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// CHECK-RV64-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-RV64-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-RV64-NEXT: call void @llvm.riscv.vssseg2.nxv2i32.i64( [[TMP2]], [[TMP3]], ptr [[BASE]], i64 [[BSTRIDE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vssseg2e32_v_tuple_i32m1(int32_t *base, ptrdiff_t bstride, vint32m1x2_t v_tuple, size_t vl) { + return __riscv_vssseg2e32_v_tuple_i32m1(base, bstride, v_tuple, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vssseg2e32_v_tuple_i32m1_m +// CHECK-RV64-SAME: ( [[MASK:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[BSTRIDE:%.*]], [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// CHECK-RV64-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// CHECK-RV64-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-RV64-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-RV64-NEXT: call void @llvm.riscv.vssseg2.mask.nxv2i32.i64( [[TMP2]], [[TMP3]], ptr [[BASE]], i64 [[BSTRIDE]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vssseg2e32_v_tuple_i32m1_m(vbool32_t mask, int32_t *base, ptrdiff_t 
bstride, vint32m1x2_t v_tuple, size_t vl) { + return __riscv_vssseg2e32_v_tuple_i32m1_m(mask, base, bstride, v_tuple, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei32_tuple.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei32_tuple.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vsuxseg2ei32_tuple.c @@ -0,0 +1,36 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_tuple_i32m1 +// CHECK-RV64-SAME: (ptr noundef [[BASE:%.*]], [[BINDEX:%.*]], [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// CHECK-RV64-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// CHECK-RV64-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-RV64-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.nxv2i32.nxv2i32.i64( [[TMP2]], [[TMP3]], ptr [[BASE]], [[BINDEX]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_tuple_i32m1(int32_t *base, vuint32m1_t bindex, vint32m1x2_t v_tuple, size_t vl) { + return __riscv_vsuxseg2ei32_v_tuple_i32m1(base, bindex, v_tuple, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_tuple_i32m1_m +// CHECK-RV64-SAME: ( [[MASK:%.*]], ptr noundef [[BASE:%.*]], [[BINDEX:%.*]], 
[[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// CHECK-RV64-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// CHECK-RV64-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-RV64-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.nxv2i32.nxv2i32.i64( [[TMP2]], [[TMP3]], ptr [[BASE]], [[BINDEX]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_tuple_i32m1_m(vbool32_t mask, int32_t *base, vuint32m1_t bindex, vint32m1x2_t v_tuple, size_t vl) { + return __riscv_vsuxseg2ei32_v_tuple_i32m1_m(mask, base, bindex, v_tuple, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c @@ -0,0 +1,91 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve32x -O0 \ +// RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=O0 +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve32x -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s --check-prefix=AFTER_MEM2REG + + +#include + +// Declare local variable +// O0-LABEL: define dso_local void @foo +// O0-SAME: () #[[ATTR0:[0-9]+]] { +// O0-NEXT: entry: +// O0-NEXT: [[V_TUPLE:%.*]] = alloca { , }, align 4 +// O0-NEXT: ret void +// +// AFTER_MEM2REG-LABEL: define dso_local void @foo +// AFTER_MEM2REG-SAME: () #[[ATTR0:[0-9]+]] { +// AFTER_MEM2REG-NEXT: entry: +// AFTER_MEM2REG-NEXT: ret void +// +void foo() { + __rvv_int32m1x2_t v_tuple; +} + +// Declare local variable and 
return +// O0-LABEL: define dso_local { , } @bar +// O0-SAME: () #[[ATTR0]] { +// O0-NEXT: entry: +// O0-NEXT: [[V_TUPLE:%.*]] = alloca { , }, align 4 +// O0-NEXT: [[TMP0:%.*]] = load { , }, ptr [[V_TUPLE]], align 4 +// O0-NEXT: ret { , } [[TMP0]] +// +// AFTER_MEM2REG-LABEL: define dso_local { , } @bar +// AFTER_MEM2REG-SAME: () #[[ATTR0]] { +// AFTER_MEM2REG-NEXT: entry: +// AFTER_MEM2REG-NEXT: ret { , } undef +// +__rvv_int32m1x2_t bar() { + __rvv_int32m1x2_t v_tuple; + return v_tuple; +} + +// Pass as function parameter +// O0-LABEL: define dso_local void @baz +// O0-SAME: ( [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]]) #[[ATTR0]] { +// O0-NEXT: entry: +// O0-NEXT: [[V_TUPLE:%.*]] = alloca { , }, align 4 +// O0-NEXT: [[V_TUPLE_ADDR:%.*]] = alloca { , }, align 4 +// O0-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// O0-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// O0-NEXT: store { , } [[TMP1]], ptr [[V_TUPLE]], align 4 +// O0-NEXT: [[V_TUPLE1:%.*]] = load { , }, ptr [[V_TUPLE]], align 4 +// O0-NEXT: store { , } [[V_TUPLE1]], ptr [[V_TUPLE_ADDR]], align 4 +// O0-NEXT: ret void +// +// AFTER_MEM2REG-LABEL: define dso_local void @baz +// AFTER_MEM2REG-SAME: ( [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]]) #[[ATTR0]] { +// AFTER_MEM2REG-NEXT: entry: +// AFTER_MEM2REG-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// AFTER_MEM2REG-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// AFTER_MEM2REG-NEXT: ret void +// +void baz(__rvv_int32m1x2_t v_tuple) { +} + +// Pass as function parameter and return +// O0-LABEL: define dso_local { , } @qux +// O0-SAME: ( [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]]) #[[ATTR0]] { +// O0-NEXT: entry: +// O0-NEXT: [[V_TUPLE:%.*]] = alloca { , }, align 4 +// O0-NEXT: [[V_TUPLE_ADDR:%.*]] = alloca { , }, align 4 +// O0-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// O0-NEXT: [[TMP1:%.*]] = 
insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// O0-NEXT: store { , } [[TMP1]], ptr [[V_TUPLE]], align 4 +// O0-NEXT: [[V_TUPLE1:%.*]] = load { , }, ptr [[V_TUPLE]], align 4 +// O0-NEXT: store { , } [[V_TUPLE1]], ptr [[V_TUPLE_ADDR]], align 4 +// O0-NEXT: [[TMP2:%.*]] = load { , }, ptr [[V_TUPLE_ADDR]], align 4 +// O0-NEXT: ret { , } [[TMP2]] +// +// AFTER_MEM2REG-LABEL: define dso_local { , } @qux +// AFTER_MEM2REG-SAME: ( [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]]) #[[ATTR0]] { +// AFTER_MEM2REG-NEXT: entry: +// AFTER_MEM2REG-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// AFTER_MEM2REG-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// AFTER_MEM2REG-NEXT: ret { , } [[TMP1]] +// +__rvv_int32m1x2_t qux(__rvv_int32m1x2_t v_tuple) { + return v_tuple; +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vget-index-out-of-range.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vget-index-out-of-range.c --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vget-index-out-of-range.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vget-index-out-of-range.c @@ -339,3 +339,8 @@ // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} return __riscv_vget_v_f16m8_f16m4(src, 2); } + +vint32m1_t test_vget_v_i32m1x2_i32m1(vint32m1x2_t src) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + return __riscv_vget_v_i32m1x2_i32m1(src, 2); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vset-index-out-of-range.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vset-index-out-of-range.c --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vset-index-out-of-range.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vset-index-out-of-range.c @@ -339,3 +339,8 @@ // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} return __riscv_vset_v_f16m4_f16m8(dest, 2, val); } + +vint32m1x2_t 
test_vset_v_i32m1x2_i32m1(vint32m1x2_t dest, vint32m1_t val) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + return __riscv_vset_v_i32m1x2_i32m1(dest, 2, val); +} diff --git a/clang/test/CodeGen/assignment-tracking/flag.cpp b/clang/test/CodeGen/assignment-tracking/flag.cpp --- a/clang/test/CodeGen/assignment-tracking/flag.cpp +++ b/clang/test/CodeGen/assignment-tracking/flag.cpp @@ -8,10 +8,10 @@ // RUN: -emit-llvm %s -o - -fexperimental-assignment-tracking=disabled -O1\ // RUN: | FileCheck %s --check-prefixes=DISABLE -//// Enabled by default: +//// Disabled by default: // RUN: %clang_cc1 -triple x86_64-none-linux-gnu -debug-info-kind=standalone \ // RUN: -emit-llvm %s -o - -O1 \ -// RUN: | FileCheck %s --check-prefixes=ENABLE +// RUN: | FileCheck %s --check-prefixes=DISABLE //// Disabled at O0 unless forced. // RUN: %clang_cc1 -triple x86_64-none-linux-gnu -debug-info-kind=standalone \ diff --git a/clang/test/CodeGen/ubsan-function.c b/clang/test/CodeGen/ubsan-function.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/ubsan-function.c @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -emit-llvm -triple x86_64 -std=c17 -fsanitize=function %s -o - | FileCheck %s + +// CHECK-LABEL: define{{.*}} @call_no_prototype( +// CHECK-NOT: __ubsan_handle_function_type_mismatch +void call_no_prototype(void (*f)()) { f(); } + +// CHECK-LABEL: define{{.*}} @call_prototype( +// CHECK: __ubsan_handle_function_type_mismatch +void call_prototype(void (*f)(void)) { f(); } diff --git a/clang/test/PCH/asm-label.cpp b/clang/test/PCH/asm-label.cpp new file mode 100644 --- /dev/null +++ b/clang/test/PCH/asm-label.cpp @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -emit-pch %s -o %t +// RUN: %clang_cc1 -include-pch %t %s -verify +#ifndef HEADER_H +#define HEADER_H +template +void MyMethod() { + void *bar; + some_path: + asm goto + ( + "mov %w[foo], %w[foo]" + : [foo] "=r"(bar) + : [foo2] "r"(bar), [foo3] "r"(bar), [foo4] "r"(bar) + : + : some_path + ); + } +#else +void 
test() { + MyMethod(); +// expected-no-diagnostics +} +#endif diff --git a/clang/test/Sema/flexible-array-in-union.c b/clang/test/Sema/flexible-array-in-union.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/flexible-array-in-union.c @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 %s -verify=c -fsyntax-only +// RUN: %clang_cc1 %s -verify -fsyntax-only -x c++ +// RUN: %clang_cc1 %s -verify -fsyntax-only -fms-compatibility +// RUN: %clang_cc1 %s -verify -fsyntax-only -fms-compatibility -x c++ + +// The test checks that an attempt to initialize union with flexible array +// member with an initializer list doesn't crash clang. + + +union { char x[]; } r = {0}; // c-error {{flexible array member 'x' in a union is not allowed}} + +// expected-no-diagnostics + diff --git a/clang/test/Sema/riscv-types.c b/clang/test/Sema/riscv-types.c --- a/clang/test/Sema/riscv-types.c +++ b/clang/test/Sema/riscv-types.c @@ -133,6 +133,9 @@ // CHECK: __rvv_int8mf2_t x43; __rvv_int8mf2_t x43; + + // CHECK: __rvv_int32m1x2_t x44; + __rvv_int32m1x2_t x44; } typedef __rvv_bool4_t vbool4_t; diff --git a/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp --- a/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp @@ -114,12 +114,10 @@ return {nullptr, {}, {}}; // Value of the unary op. - auto *UnaryOpValue = State.Env.getValue(*UO, SkipPast::None); + auto *UnaryOpValue = State.Env.getValueStrict(*UO); if (!UnaryOpValue) { - auto &Loc = State.Env.createStorageLocation(*UO); - State.Env.setStorageLocation(*UO, Loc); UnaryOpValue = &State.Env.makeAtomicBoolValue(); - State.Env.setValue(Loc, *UnaryOpValue); + State.Env.setValueStrict(*UO, *UnaryOpValue); } // Properties for the operand (sub expression). 
@@ -133,22 +131,17 @@ void transferBinary(const BinaryOperator *BO, const MatchFinder::MatchResult &M, LatticeTransferState &State) { - StorageLocation *Loc = State.Env.getStorageLocation(*BO, SkipPast::None); - if (!Loc) { - Loc = &State.Env.createStorageLocation(*BO); - State.Env.setStorageLocation(*BO, *Loc); - } - BoolValue *Comp = cast_or_null(State.Env.getValue(*Loc)); + BoolValue *Comp = cast_or_null(State.Env.getValueStrict(*BO)); if (!Comp) { Comp = &State.Env.makeAtomicBoolValue(); - State.Env.setValue(*Loc, *Comp); + State.Env.setValueStrict(*BO, *Comp); } // FIXME Use this as well: // auto *NegatedComp = &State.Env.makeNot(*Comp); - auto *LHS = State.Env.getValue(*BO->getLHS(), SkipPast::None); - auto *RHS = State.Env.getValue(*BO->getRHS(), SkipPast::None); + auto *LHS = State.Env.getValueStrict(*BO->getLHS()); + auto *RHS = State.Env.getValueStrict(*BO->getRHS()); if (!LHS || !RHS) return; @@ -244,19 +237,43 @@ } } +// Returns the `Value` associated with `E` (which may be either a prvalue or +// glvalue). Creates a `Value` or `StorageLocation` as needed if `E` does not +// have either of these associated with it yet. +// +// If this functionality turns out to be needed in more cases, this function +// should be moved to a more central location. 
+Value *getOrCreateValue(const Expr *E, Environment &Env) { + Value *Val = nullptr; + if (E->isGLValue()) { + StorageLocation *Loc = Env.getStorageLocationStrict(*E); + if (!Loc) { + Loc = &Env.createStorageLocation(*E); + Env.setStorageLocationStrict(*E, *Loc); + } + Val = Env.getValue(*Loc); + if (!Val) { + Val = Env.createValue(E->getType()); + Env.setValue(*Loc, *Val); + } + } else { + Val = Env.getValueStrict(*E); + if (!Val) { + Val = Env.createValue(E->getType()); + Env.setValueStrict(*E, *Val); + } + } + assert(Val != nullptr); + + return Val; +} + void transferExpr(const Expr *E, const MatchFinder::MatchResult &M, LatticeTransferState &State) { const ASTContext &Context = *M.Context; - StorageLocation *Loc = State.Env.getStorageLocation(*E, SkipPast::None); - if (!Loc) { - Loc = &State.Env.createStorageLocation(*E); - State.Env.setStorageLocation(*E, *Loc); - } - Value *Val = State.Env.getValue(*Loc); - if (!Val) { - Val = State.Env.createValue(Context.IntTy); - State.Env.setValue(*Loc, *Val); - } + + Value *Val = getOrCreateValue(E, State.Env); + // The sign symbolic values have been initialized already. if (Val->getProperty("neg")) return; diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp --- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp @@ -296,6 +296,22 @@ UnorderedElementsAre("foo")))))); } +TEST_F(NoreturnDestructorTest, + ConditionalOperatorConstantCondition_LeftBranchReturns) { + std::string Code = R"( + #include "noreturn_destructor_test_defs.h" + + void target() { + int value = true ? 
foo() : Fatal().bar(); + (void)0; + // [[p]] + } + )"; + runDataflow(Code, UnorderedElementsAre(IsStringMapEntry( + "p", HoldsFunctionCallLattice(HasCalledFunctions( + UnorderedElementsAre("foo")))))); +} + TEST_F(NoreturnDestructorTest, ConditionalOperatorRightBranchReturns) { std::string Code = R"( #include "noreturn_destructor_test_defs.h" @@ -311,6 +327,22 @@ UnorderedElementsAre("foo")))))); } +TEST_F(NoreturnDestructorTest, + ConditionalOperatorConstantCondition_RightBranchReturns) { + std::string Code = R"( + #include "noreturn_destructor_test_defs.h" + + void target() { + int value = false ? Fatal().bar() : foo(); + (void)0; + // [[p]] + } + )"; + runDataflow(Code, UnorderedElementsAre(IsStringMapEntry( + "p", HoldsFunctionCallLattice(HasCalledFunctions( + UnorderedElementsAre("foo")))))); +} + TEST_F(NoreturnDestructorTest, ConditionalOperatorNestedBranchesDoNotReturn) { std::string Code = R"( #include "noreturn_destructor_test_defs.h" diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp --- a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp @@ -2764,9 +2764,6 @@ } TEST_P(UncheckedOptionalAccessTest, OptionalValueInitialization) { - // FIXME: Fix when to initialize `value`. All unwrapping should be safe in - // this example, but `value` initialization is done multiple times during the - // fixpoint iterations and joining the environment won't correctly merge them. ExpectDiagnosticsFor( R"( #include "unchecked_optional_access_test.h" @@ -2786,7 +2783,7 @@ } // Now we merge the two values. UncheckedOptionalAccessModel::merge() will // throw away the "value" property. 
- foo->value(); // [[unsafe]] + foo->value(); } )"); } diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp --- a/clang/utils/TableGen/RISCVVEmitter.cpp +++ b/clang/utils/TableGen/RISCVVEmitter.cpp @@ -65,6 +65,7 @@ bool HasMaskedOffOperand :1; bool HasTailPolicy : 1; bool HasMaskPolicy : 1; + bool IsTuple : 1; uint8_t UnMaskedPolicyScheme : 2; uint8_t MaskedPolicyScheme : 2; }; @@ -363,6 +364,16 @@ TypeModifier::UnsignedInteger)); printType(*UT); } + // FIXME: Expand more type declaration + if (I == 'i' && Log2LMUL == 0) { // vint32m1x2_t + auto TupleT = TypeCache.computeType( + BT, Log2LMUL, + PrototypeDescriptor(BaseTypeModifier::Vector, + VectorTypeModifier::Tuple2, + TypeModifier::SignedInteger)); + if (TupleT) + printType(*TupleT); + } } } @@ -512,6 +523,7 @@ StringRef IRName = R->getValueAsString("IRName"); StringRef MaskedIRName = R->getValueAsString("MaskedIRName"); unsigned NF = R->getValueAsInt("NF"); + bool IsTuple = R->getValueAsBit("IsTuple"); const Policy DefaultPolicy; SmallVector SupportedUnMaskedPolicies = @@ -532,10 +544,10 @@ auto Prototype = RVVIntrinsic::computeBuiltinTypes( BasicPrototype, /*IsMasked=*/false, /*HasMaskedOffOperand=*/false, HasVL, NF, UnMaskedPolicyScheme, - DefaultPolicy); + DefaultPolicy, IsTuple); auto MaskedPrototype = RVVIntrinsic::computeBuiltinTypes( BasicPrototype, /*IsMasked=*/true, HasMaskedOffOperand, HasVL, NF, - MaskedPolicyScheme, DefaultPolicy); + MaskedPolicyScheme, DefaultPolicy, IsTuple); // Create Intrinsics for each type and LMUL. 
for (char I : TypeRange) { @@ -564,7 +576,7 @@ RVVIntrinsic::computeBuiltinTypes( BasicPrototype, /*IsMasked=*/false, /*HasMaskedOffOperand=*/false, HasVL, NF, - UnMaskedPolicyScheme, P); + UnMaskedPolicyScheme, P, IsTuple); std::optional PolicyTypes = TypeCache.computeTypes(BT, Log2LMUL, NF, PolicyPrototype); Out.push_back(std::make_unique( @@ -590,7 +602,7 @@ SmallVector PolicyPrototype = RVVIntrinsic::computeBuiltinTypes( BasicPrototype, /*IsMasked=*/true, HasMaskedOffOperand, HasVL, - NF, MaskedPolicyScheme, P); + NF, MaskedPolicyScheme, P, IsTuple); std::optional PolicyTypes = TypeCache.computeTypes(BT, Log2LMUL, NF, PolicyPrototype); Out.push_back(std::make_unique( @@ -650,6 +662,7 @@ SR.Prototype = std::move(BasicPrototype); SR.Suffix = parsePrototypes(SuffixProto); SR.OverloadedSuffix = parsePrototypes(OverloadedSuffixProto); + SR.IsTuple = IsTuple; SemaRecords->push_back(SR); } @@ -691,6 +704,7 @@ R.HasMaskPolicy = SR.HasMaskPolicy; R.UnMaskedPolicyScheme = SR.UnMaskedPolicyScheme; R.MaskedPolicyScheme = SR.MaskedPolicyScheme; + R.IsTuple = SR.IsTuple; assert(R.PrototypeIndex != static_cast(SemaSignatureTable::INVALID_INDEX)); diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/c.c b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/c.c new file mode 100644 --- /dev/null +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/c.c @@ -0,0 +1,14 @@ +// RUN: %clang -g -fsanitize=function %s -o %t +// RUN: %run %t 2>&1 | FileCheck %s --check-prefix=CHECK --implicit-check-not='runtime error:' + +void f(void (*fp)(int (*)[])) { fp(0); } + +void callee0(int (*a)[]) {} +void callee1(int (*a)[1]) {} + +int main() { + int a[1]; + f(callee0); + // CHECK: runtime error: call to function callee1 through pointer to incorrect function type 'void (*)(int (*)[])' + f(callee1); // compatible type in C, but flagged +} diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/function.cpp 
b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/function.cpp --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/function.cpp +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/function.cpp @@ -1,6 +1,3 @@ -// Work around "Cannot represent a difference across sections" -// UNSUPPORTED: target=powerpc64-{{.*}} - // RUN: %clangxx -DDETERMINE_UNIQUE %s -o %t-unique // RUN: %clangxx -std=c++17 -fsanitize=function %s -O3 -g -DSHARED_LIB -fPIC -shared -o %t-so.so // RUN: %clangxx -std=c++17 -fsanitize=function %s -O3 -g -o %t %t-so.so diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py @@ -1,5 +1,8 @@ if config.host_os not in ['Darwin', 'FreeBSD', 'Linux', 'NetBSD']: config.unsupported = True +# Work around "Cannot represent a difference across sections" +if config.target_arch == 'powerpc64': + config.unsupported = True # Work around "library ... not found: needed by main executable" in qemu. if config.android and config.target_arch not in ['x86', 'x86_64']: config.unsupported = True diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -363,6 +363,7 @@ * Constraint C1406, which prohibits the same module name from being used in a scope for both an intrinsic and a non-intrinsic module, is implemented as a portability warning only, not a hard error. +* IBM @PROCESS directive is accepted but ignored. 
## Preprocessing behavior diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -23,6 +23,10 @@ class FirOpBuilder; } +namespace mlir { +class IRMapping; +} + namespace hlfir { class AssociateOp; @@ -359,13 +363,18 @@ mlir::ValueRange typeParams, const ElementalKernelGenerator &genKernel); +/// Structure to describe a loop nest. +struct LoopNest { + fir::DoLoopOp outerLoop; + fir::DoLoopOp innerLoop; + llvm::SmallVector oneBasedIndices; +}; + /// Generate a fir.do_loop nest looping from 1 to extents[i]. -/// Return the inner fir.do_loop and the indices of the loops. -std::pair> -genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder, - mlir::ValueRange extents); -inline std::pair> -genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder, mlir::Value shape) { +LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange extents); +inline LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder, + mlir::Value shape) { return genLoopNest(loc, builder, getIndexExtents(loc, builder, shape)); } @@ -379,6 +388,20 @@ hlfir::ElementalOp elemental, mlir::ValueRange oneBasedIndices); +/// Inline the body of an hlfir.elemental without cloning the resulting +/// hlfir.yield_element, and return the cloned operand of the +/// hlfir.yield_element. The mapper must be provided to cover complex cases +/// where the inlined elemental is not defined in the current context and uses +/// values that have been cloned already. +/// A callback is provided to indicate if an hlfir.apply inside the +/// hlfir.elemental must be immediately replaced by the inlining of the +/// applied hlfir.elemental. 
+mlir::Value inlineElementalOp( + mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::ElementalOp elemental, mlir::ValueRange oneBasedIndices, + mlir::IRMapping &mapper, + const std::function &mustRecursivelyInline); + std::pair> convertToValue(mlir::Location loc, fir::FirOpBuilder &builder, const hlfir::Entity &entity); diff --git a/flang/include/flang/Optimizer/Dialect/FIRDialect.h b/flang/include/flang/Optimizer/Dialect/FIRDialect.h --- a/flang/include/flang/Optimizer/Dialect/FIRDialect.h +++ b/flang/include/flang/Optimizer/Dialect/FIRDialect.h @@ -47,6 +47,9 @@ void registerAttributes(); // Register the Types of this dialect. void registerTypes(); + // Register external interfaces on operations of + // this dialect. + void registerOpExternalInterfaces(); }; /// The FIR codegen dialect is a dialect containing a small set of transient diff --git a/flang/include/flang/Optimizer/Dialect/FIRType.h b/flang/include/flang/Optimizer/Dialect/FIRType.h --- a/flang/include/flang/Optimizer/Dialect/FIRType.h +++ b/flang/include/flang/Optimizer/Dialect/FIRType.h @@ -359,7 +359,7 @@ mlir::Type fromRealTypeID(mlir::MLIRContext *context, llvm::Type::TypeID typeID, fir::KindTy kind); -int getTypeCode(mlir::Type ty, KindMapping &kindMap); +int getTypeCode(mlir::Type ty, const KindMapping &kindMap); inline bool BaseBoxType::classof(mlir::Type type) { return type.isa(); @@ -413,6 +413,14 @@ return fir::unwrapRefType(t).isa(); } +/// Return a string representation of `ty`. The fir.ref is omitted in the +/// representation. 
+/// +/// fir.array<10x10xf32> -> prefix_10x10xf32 +/// fir.ref -> i32 +std::string getTypeAsString(mlir::Type ty, const KindMapping &kindMap, + llvm::StringRef prefix = ""); + } // namespace fir #endif // FORTRAN_OPTIMIZER_DIALECT_FIRTYPE_H diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td --- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td +++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td @@ -644,10 +644,14 @@ let extraClassDeclaration = [{ mlir::Block *getBody() { return &getRegion().front(); } - // Get the indices iterating over the shape. + /// Get the indices iterating over the shape. mlir::Block::BlockArgListType getIndices() { return getBody()->getArguments(); } + + /// Must this elemental be evaluated in order? + /// TODO: add attribute and set it in lowering. + bool isOrdered() {return true;} }]; let skipDefaultBuilders = 1; diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -1517,11 +1517,10 @@ // iterations are cleaned up inside the iterations. if (!callContext.resultType) { // Subroutine case. Generate call inside loop nest. 
- auto [innerLoop, oneBasedIndicesVector] = - hlfir::genLoopNest(loc, builder, shape); - mlir::ValueRange oneBasedIndices = oneBasedIndicesVector; + hlfir::LoopNest loopNest = hlfir::genLoopNest(loc, builder, shape); + mlir::ValueRange oneBasedIndices = loopNest.oneBasedIndices; auto insPt = builder.saveInsertionPoint(); - builder.setInsertionPointToStart(innerLoop.getBody()); + builder.setInsertionPointToStart(loopNest.innerLoop.getBody()); callContext.stmtCtx.pushScope(); for (auto &preparedActual : loweredActuals) if (preparedActual) diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp --- a/flang/lib/Lower/ConvertConstant.cpp +++ b/flang/lib/Lower/ConvertConstant.cpp @@ -20,6 +20,8 @@ #include "flang/Optimizer/Builder/Complex.h" #include "flang/Optimizer/Builder/Todo.h" +#include + /// Convert string, \p s, to an APFloat value. Recognize and handle Inf and /// NaN strings as well. \p s is assumed to not contain any spaces. static llvm::APFloat consAPFloat(const llvm::fltSemantics &fsem, @@ -66,17 +68,12 @@ /// Helper class to lower an array constant to a global with an MLIR dense /// attribute. /// -/// If we have a rank-1 array of integer, real, or logical, then we can +/// If we have an array of integer, real, or logical, then we can /// create a global array with the dense attribute. /// /// The mlir tensor type can only handle integer, real, or logical. It /// does not currently support nested structures which is required for /// complex. -/// -/// Also, we currently handle just rank-1 since tensor type assumes -/// row major array ordering. We will need to reorder the dimensions -/// in the tensor type to support Fortran's column major array ordering. -/// How to create this tensor type is to be determined. 
class DenseGlobalBuilder { public: static fir::GlobalOp tryCreating(fir::FirOpBuilder &builder, @@ -124,8 +121,6 @@ &constant) { static_assert(TC != Fortran::common::TypeCategory::Character, "must be numerical or logical"); - if (constant.Rank() != 1) - return; auto attrTc = TC == Fortran::common::TypeCategory::Logical ? Fortran::common::TypeCategory::Integer : TC; @@ -158,12 +153,16 @@ llvm::StringRef globalName, mlir::StringAttr linkage, bool isConst) const { - // Not a rank 1 "trivial" intrinsic constant array, or empty array. + // Not a "trivial" intrinsic constant array, or empty array. if (!attributeElementType || attributes.empty()) return {}; + assert(symTy.isa() && "expecting an array global"); + auto arrTy = symTy.cast(); + llvm::SmallVector tensorShape(arrTy.getShape()); + std::reverse(tensorShape.begin(), tensorShape.end()); auto tensorTy = - mlir::RankedTensorType::get(attributes.size(), attributeElementType); + mlir::RankedTensorType::get(tensorShape, attributeElementType); auto init = mlir::DenseElementsAttr::get(tensorTy, attributes); return builder.createGlobal(loc, symTy, globalName, linkage, init, isConst); } @@ -544,6 +543,13 @@ true, constant); } if (!global) + // If the number of elements of the array is huge, the compilation may + // use a lot of memory and take a very long time to complete. + // Empirical evidence shows that an array with 150000 elements of + // complex type takes roughly 30 seconds to compile and uses 4GB of RAM, + // on a modern machine. + // It would be nice to add a driver switch to control the array size + // after which flang should not continue to compile. 
global = builder.createGlobalConstant( loc, arrayTy, globalName, [&](fir::FirOpBuilder &builder) { diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -431,14 +431,9 @@ // If this is an array, check to see if we can use a dense attribute // with a tensor mlir type. This optimization currently only supports - // rank-1 Fortran arrays of integer, real, or logical. The tensor - // type does not support nested structures which are needed for - // complex numbers. - // To get multidimensional arrays to work, we will have to use column major - // array ordering with the tensor type (so it matches column major ordering - // with the Fortran fir.array). By default, tensor types assume row major - // ordering. How to create this tensor type is to be determined. - if (symTy.isa() && sym.Rank() == 1 && + // Fortran arrays of integer, real, or logical. The tensor type does + // not support nested structures which are needed for complex numbers. 
+ if (symTy.isa() && !Fortran::semantics::IsAllocatableOrPointer(sym)) { mlir::Type eleTy = symTy.cast().getEleTy(); if (eleTy.isa()) { diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -478,6 +478,57 @@ } } +static mlir::acc::PrivateRecipeOp +createBasePrivateRecipeOp(fir::FirOpBuilder &builder, mlir::Value input, + llvm::StringRef recipeName, mlir::Location loc) { + mlir::ModuleOp mod = builder.getModule(); + mlir::OpBuilder modBuilder(mod.getBodyRegion()); + mlir::Type ty = input.getType(); + auto recipe = + modBuilder.create(loc, recipeName, ty); + builder.createBlock(&recipe.getInitRegion(), recipe.getInitRegion().end(), + {ty}, {loc}); + builder.setInsertionPointToEnd(&recipe.getInitRegion().back()); + builder.create( + loc, recipe.getInitRegion().front().getArgument(0)); + return recipe; +} + +static void +genPrivatizations(const Fortran::parser::AccObjectList &objectList, + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::StatementContext &stmtCtx, + llvm::SmallVectorImpl &dataOperands, + llvm::SmallVector &privatizations) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mlir::ModuleOp mod = builder.getModule(); + for (const auto &accObject : objectList.v) { + llvm::SmallVector bounds; + std::stringstream asFortran; + mlir::Location operandLocation = genOperandLocation(converter, accObject); + mlir::Value baseAddr = gatherDataOperandAddrAndBounds( + converter, builder, semanticsContext, stmtCtx, accObject, + operandLocation, asFortran, bounds); + + std::string recipeName = fir::getTypeAsString( + baseAddr.getType(), converter.getKindMap(), "privatization"); + if (auto recipe = + mod.lookupSymbol(recipeName)) { + privatizations.push_back(mlir::SymbolRefAttr::get( + builder.getContext(), recipe.getSymName().str())); + } else { + auto crtPos = builder.saveInsertionPoint(); + 
mlir::acc::PrivateRecipeOp newRecipe = createBasePrivateRecipeOp( + builder, baseAddr, recipeName, operandLocation); + builder.restoreInsertionPoint(crtPos); + privatizations.push_back(mlir::SymbolRefAttr::get( + builder.getContext(), newRecipe.getSymName().str())); + } + dataOperands.push_back(baseAddr); + } +} + template static void genObjectListWithModifier( const Clause *x, Fortran::lower::AbstractConverter &converter, @@ -633,7 +684,7 @@ Fortran::semantics::SemanticsContext &semanticsContext, Fortran::lower::StatementContext &stmtCtx, const Fortran::parser::AccClauseList &accClauseList) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); mlir::Value workerNum; mlir::Value vectorNum; @@ -641,6 +692,7 @@ mlir::Value gangStatic; llvm::SmallVector tileOperands, privateOperands, reductionOperands; + llvm::SmallVector privatizations; bool hasGang = false, hasVector = false, hasWorker = false; for (const Fortran::parser::AccClause &clause : accClauseList.v) { @@ -665,8 +717,8 @@ } else { // * was passed as value and will be represented as a special // constant. - gangStatic = firOpBuilder.createIntegerConstant( - clauseLocation, firOpBuilder.getIndexType(), starCst); + gangStatic = builder.createIntegerConstant( + clauseLocation, builder.getIndexType(), starCst); } } } @@ -698,8 +750,8 @@ } else { // * was passed as value and will be represented as a -1 constant // integer. 
- mlir::Value tileStar = firOpBuilder.createIntegerConstant( - clauseLocation, firOpBuilder.getIntegerType(32), + mlir::Value tileStar = builder.createIntegerConstant( + clauseLocation, builder.getIntegerType(32), /* STAR */ -1); tileOperands.push_back(tileStar); } @@ -707,8 +759,8 @@ } else if (const auto *privateClause = std::get_if( &clause.u)) { - genObjectList(privateClause->v, converter, semanticsContext, stmtCtx, - privateOperands); + genPrivatizations(privateClause->v, converter, semanticsContext, stmtCtx, + privateOperands, privatizations); } else if (std::get_if(&clause.u)) { // Reduction clause is left out for the moment as the clause will probably // end up having its own operation. @@ -728,14 +780,18 @@ addOperands(operands, operandSegments, reductionOperands); auto loopOp = createRegionOp( - firOpBuilder, currentLocation, operands, operandSegments); + builder, currentLocation, operands, operandSegments); if (hasGang) - loopOp.setHasGangAttr(firOpBuilder.getUnitAttr()); + loopOp.setHasGangAttr(builder.getUnitAttr()); if (hasWorker) - loopOp.setHasWorkerAttr(firOpBuilder.getUnitAttr()); + loopOp.setHasWorkerAttr(builder.getUnitAttr()); if (hasVector) - loopOp.setHasVectorAttr(firOpBuilder.getUnitAttr()); + loopOp.setHasVectorAttr(builder.getUnitAttr()); + + if (!privatizations.empty()) + loopOp.setPrivatizationsAttr( + mlir::ArrayAttr::get(builder.getContext(), privatizations)); // Lower clauses mapped to attributes for (const Fortran::parser::AccClause &clause : accClauseList.v) { @@ -745,16 +801,16 @@ const std::optional collapseValue = Fortran::evaluate::ToInt64(*expr); if (collapseValue) { - loopOp.setCollapseAttr(firOpBuilder.getI64IntegerAttr(*collapseValue)); + loopOp.setCollapseAttr(builder.getI64IntegerAttr(*collapseValue)); } } else if (std::get_if(&clause.u)) { - loopOp.setSeqAttr(firOpBuilder.getUnitAttr()); + loopOp.setSeqAttr(builder.getUnitAttr()); } else if (std::get_if( &clause.u)) { - 
loopOp.setIndependentAttr(firOpBuilder.getUnitAttr()); + loopOp.setIndependentAttr(builder.getUnitAttr()); } else if (std::get_if(&clause.u)) { loopOp->setAttr(mlir::acc::LoopOp::getAutoAttrStrName(), - firOpBuilder.getUnitAttr()); + builder.getUnitAttr()); } } return loopOp; @@ -824,9 +880,9 @@ copyEntryOperands, copyoutEntryOperands, createEntryOperands, dataClauseOperands; - // TODO: need to more work/design. llvm::SmallVector reductionOperands, privateOperands, firstprivateOperands; + llvm::SmallVector privatizations; // Async, wait and self clause have optional values but can be present with // no value as well. When there is no value, the op has an attribute to @@ -973,8 +1029,8 @@ } else if (const auto *privateClause = std::get_if( &clause.u)) { - genObjectList(privateClause->v, converter, semanticsContext, stmtCtx, - privateOperands); + genPrivatizations(privateClause->v, converter, semanticsContext, stmtCtx, + privateOperands, privatizations); } else if (const auto *firstprivateClause = std::get_if( &clause.u)) { @@ -1019,6 +1075,12 @@ if (addSelfAttr) computeOp.setSelfAttrAttr(builder.getUnitAttr()); + if constexpr (!std::is_same_v) { + if (!privatizations.empty()) + computeOp.setPrivatizationsAttr( + mlir::ArrayAttr::get(builder.getContext(), privatizations)); + } + auto insPt = builder.saveInsertionPoint(); builder.setInsertionPointAfter(computeOp); diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -764,26 +764,62 @@ return yield; } -std::pair> -hlfir::genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder, - mlir::ValueRange extents) { +mlir::Value hlfir::inlineElementalOp( + mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::ElementalOp elemental, mlir::ValueRange oneBasedIndices, + mlir::IRMapping &mapper, + const std::function &mustRecursivelyInline) { + mlir::Region ®ion = 
elemental.getRegion(); + // hlfir.elemental region is a SizedRegion<1>. + assert(region.hasOneBlock() && "elemental region must have one block"); + mapper.map(elemental.getIndices(), oneBasedIndices); + mlir::Block::OpListType &ops = region.back().getOperations(); + assert(!ops.empty() && "elemental block cannot be empty"); + auto end = ops.end(); + for (auto opIt = ops.begin(); std::next(opIt) != end; ++opIt) { + if (auto apply = mlir::dyn_cast(*opIt)) + if (auto appliedElemental = + apply.getExpr().getDefiningOp()) + if (mustRecursivelyInline(appliedElemental)) { + llvm::SmallVector clonedApplyIndices; + for (auto indice : apply.getIndices()) + clonedApplyIndices.push_back(mapper.lookupOrDefault(indice)); + mlir::Value inlined = inlineElementalOp( + loc, builder, appliedElemental, clonedApplyIndices, mapper, + mustRecursivelyInline); + mapper.map(apply.getResult(), inlined); + continue; + } + (void)builder.clone(*opIt, mapper); + } + auto oldYield = mlir::dyn_cast_or_null( + region.back().getOperations().back()); + assert(oldYield && "must terminate with yieldElementalOp"); + return mapper.lookupOrDefault(oldYield.getElementValue()); +} + +hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc, + fir::FirOpBuilder &builder, + mlir::ValueRange extents) { + hlfir::LoopNest loopNest; assert(!extents.empty() && "must have at least one extent"); auto insPt = builder.saveInsertionPoint(); - llvm::SmallVector indices(extents.size()); + loopNest.oneBasedIndices.assign(extents.size(), mlir::Value{}); // Build loop nest from column to row. 
auto one = builder.create(loc, 1); mlir::Type indexType = builder.getIndexType(); unsigned dim = extents.size() - 1; - fir::DoLoopOp innerLoop; for (auto extent : llvm::reverse(extents)) { auto ub = builder.createConvert(loc, indexType, extent); - innerLoop = builder.create(loc, one, ub, one); - builder.setInsertionPointToStart(innerLoop.getBody()); + loopNest.innerLoop = builder.create(loc, one, ub, one); + builder.setInsertionPointToStart(loopNest.innerLoop.getBody()); // Reverse the indices so they are in column-major order. - indices[dim--] = innerLoop.getInductionVar(); + loopNest.oneBasedIndices[dim--] = loopNest.innerLoop.getInductionVar(); + if (!loopNest.outerLoop) + loopNest.outerLoop = loopNest.innerLoop; } builder.restoreInsertionPoint(insPt); - return {innerLoop, indices}; + return loopNest; } static fir::ExtendedValue diff --git a/flang/lib/Optimizer/Dialect/FIRDialect.cpp b/flang/lib/Optimizer/Dialect/FIRDialect.cpp --- a/flang/lib/Optimizer/Dialect/FIRDialect.cpp +++ b/flang/lib/Optimizer/Dialect/FIRDialect.cpp @@ -64,6 +64,7 @@ #define GET_OP_LIST #include "flang/Optimizer/Dialect/FIROps.cpp.inc" >(); + registerOpExternalInterfaces(); addInterfaces(); } diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -12,6 +12,7 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRAttr.h" +#include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" @@ -19,6 +20,7 @@ #include "flang/Optimizer/Support/Utils.h" #include "mlir/Dialect/CommonFolders.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinOps.h" @@ -3758,6 +3760,17 @@ 
return fortranVar.verifyDeclareLikeOpImpl(getMemref()); } +//===----------------------------------------------------------------------===// +// FIROpsDialect +//===----------------------------------------------------------------------===// + +void fir::FIROpsDialect::registerOpExternalInterfaces() { + // Attach default declare target interfaces to operations which can be marked + // as declare target. + fir::GlobalOp::attachInterface< + mlir::omp::DeclareTargetDefaultModel>(*getContext()); +} + // Tablegen operators #define GET_OP_CLASSES diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -383,7 +383,7 @@ } /// Return the ISO_C_BINDING intrinsic module value of type \p ty. -int getTypeCode(mlir::Type ty, fir::KindMapping &kindMap) { +int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) { unsigned width = 0; if (mlir::IntegerType intTy = ty.dyn_cast()) { switch (intTy.getWidth()) { @@ -473,6 +473,50 @@ llvm_unreachable("unsupported type"); } +std::string getTypeAsString(mlir::Type ty, const fir::KindMapping &kindMap, + llvm::StringRef prefix) { + std::stringstream name; + name << prefix.str(); + if (!prefix.empty()) + name << "_"; + ty = fir::unwrapRefType(ty); + while (ty) { + if (fir::isa_trivial(ty)) { + if (ty.isIntOrIndex()) { + name << 'i' << ty.getIntOrFloatBitWidth(); + } else if (ty.isa()) { + name << 'f' << ty.getIntOrFloatBitWidth(); + } else if (fir::isa_complex(ty)) { + name << 'z'; + if (auto cplxTy = mlir::dyn_cast_or_null(ty)) { + auto floatTy = cplxTy.getElementType().cast(); + name << floatTy.getWidth(); + } else if (auto cplxTy = mlir::dyn_cast_or_null(ty)) { + name << kindMap.getRealBitsize(cplxTy.getFKind()); + } + } else if (auto logTy = mlir::dyn_cast_or_null(ty)) { + name << 'l' << kindMap.getLogicalBitsize(logTy.getFKind()); + } else { + llvm::report_fatal_error("unsupported type"); + } + 
break; + } else if (auto charTy = mlir::dyn_cast_or_null(ty)) { + name << 'c' << kindMap.getCharacterBitsize(charTy.getFKind()); + if (charTy.getLen() != fir::CharacterType::singleton()) + name << "x" << charTy.getLen(); + break; + } else if (auto seqTy = mlir::dyn_cast_or_null(ty)) { + for (auto extent : seqTy.getShape()) + name << extent << 'x'; + ty = seqTy.getEleTy(); + } else { + // TODO: add support for RecordType/BaseBoxType + llvm::report_fatal_error("unsupported type"); + } + } + return name.str(); +} + } // namespace fir namespace { diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp --- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp @@ -552,12 +552,11 @@ adaptor.getTypeparams()); // Generate a loop nest looping around the fir.elemental shape and clone // fir.elemental region inside the inner loop. - auto [innerLoop, oneBasedLoopIndices] = - hlfir::genLoopNest(loc, builder, extents); + hlfir::LoopNest loopNest = hlfir::genLoopNest(loc, builder, extents); auto insPt = builder.saveInsertionPoint(); - builder.setInsertionPointToStart(innerLoop.getBody()); - auto yield = - hlfir::inlineElementalOp(loc, builder, elemental, oneBasedLoopIndices); + builder.setInsertionPointToStart(loopNest.innerLoop.getBody()); + auto yield = hlfir::inlineElementalOp(loc, builder, elemental, + loopNest.oneBasedIndices); hlfir::Entity elementValue(yield.getElementValue()); // Skip final AsExpr if any. It would create an element temporary, // which is no needed since the element will be assigned right away in @@ -572,7 +571,7 @@ rewriter.eraseOp(yield); // Assign the element value to the temp element for this iteration. 
auto tempElement = - hlfir::getElementAt(loc, builder, temp, oneBasedLoopIndices); + hlfir::getElementAt(loc, builder, temp, loopNest.oneBasedIndices); builder.create(loc, elementValue, tempElement); // hlfir.yield_element implicitly marks the end-of-life its operand if // it is an expression created in the hlfir.elemental (since it is its diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp --- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp @@ -19,11 +19,13 @@ #include "ScheduleOrderedAssignments.h" #include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/HLFIRTools.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/HLFIR/Passes.h" #include "mlir/IR/IRMapping.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" @@ -42,6 +44,52 @@ llvm::cl::desc("Only run ordered assignment scheduling with no codegen"), llvm::cl::init(false)); +namespace { + +/// Structure that represents a masked expression being lowered. Masked +/// expressions are any expressions inside an hlfir.where. As described in +/// Fortran 2018 section 10.2.3.2, the evaluation of the elemental parts of such +/// expressions must be masked, while the evaluation of none elemental parts +/// must not be masked. This structure analyzes the region evaluating the +/// expression and allows splitting the generation of the none elemental part +/// from the elemental part. +struct MaskedArrayExpr { + MaskedArrayExpr(mlir::Location loc, mlir::Region ®ion); + + /// Generate the none elemental part. Must be called outside of the + /// loops created for the WHERE construct. 
+ void generateNoneElementalPart(fir::FirOpBuilder &builder, + mlir::IRMapping &mapper); + + /// Methods below can only be called once generateNoneElementalPart has been + /// called. + + /// Return the shape of the expression. + mlir::Value generateShape(fir::FirOpBuilder &builder, + mlir::IRMapping &mapper); + /// Return the value of an element value for this expression given the current + /// where loop indices. + mlir::Value generateElementalParts(fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::IRMapping &mapper); + /// Generate the cleanup for the none elemental parts, if any. This must be + /// called after the loops created for the WHERE construct. + void generateNoneElementalCleanupIfAny(fir::FirOpBuilder &builder, + mlir::IRMapping &mapper); + + mlir::Location loc; + mlir::Region ®ion; + /// Was generateNoneElementalPart called? + bool noneElementalPartWasGenerated = false; + /// Set of operations that form the elemental parts of the + /// expression evaluation. These are the hlfir.elemental and + /// hlfir.elemental_addr that form the elemental tree producing + /// the expression value. hlfir.elemental that produce values + /// used inside transformational operations are not part of this set. + llvm::SmallSet elementalParts{}; +}; +} // namespace + namespace { /// Structure that visits an ordered assignment tree and generates code for /// it according to a schedule. @@ -76,6 +124,8 @@ /// Generate code when leaving a given ordered assignment node. void post(hlfir::ForallOp); void post(hlfir::ForallMaskOp); + void post(hlfir::WhereOp); + void post(hlfir::ElseWhereOp); /// Is this an assignment to a vector subscripted entity? static bool hasVectorSubscriptedLhs(hlfir::RegionAssignOp regionAssignOp); @@ -105,9 +155,23 @@ /// at the current insertion point (by cloning). void generateCleanupIfAny(std::optional maybeYield); + /// Generate a masked entity. 
This can only be called when whereLoopNest was + /// set (When an hlfir.where is being visited). + /// This method returns the scalar element (that may have been previously + /// saved) for the current indices inside the where loop. + mlir::Value generateMaskedEntity(mlir::Location loc, mlir::Region ®ion) { + MaskedArrayExpr maskedExpr(loc, region); + return generateMaskedEntity(maskedExpr); + } + mlir::Value generateMaskedEntity(MaskedArrayExpr &maskedExpr); + + /// Create a fir.if at the current position inside the where loop nest + /// given a mask expression. + void generateMaskIfOp(MaskedArrayExpr &mask); + fir::FirOpBuilder &builder; - /// Map containg the mapping between the original order assignment tree + /// Map containing the mapping between the original order assignment tree /// operations and the operations that have been cloned in the current run. /// It is reset between two runs. mlir::IRMapping mapper; @@ -115,6 +179,9 @@ /// point correctly when leaving a node that requires a fir.do_loop or fir.if /// operation. llvm::SmallVector constructStack; + /// Current where loop nest, if any. + std::optional whereLoopNest; + /// Root of the order assignment tree being lowered. hlfir::OrderedAssignmentTreeOpInterface root; /// Pointer to the current run of the schedule being lowered. 
@@ -139,8 +206,8 @@ mlir::dyn_cast(op)) walk(subNode); llvm::TypeSwitch(node.getOperation()) - .Case( - [&](auto concreteOp) { post(concreteOp); }) + .Case([&](auto concreteOp) { post(concreteOp); }) .Default([](auto) {}); } } @@ -218,19 +285,78 @@ generateCleanupIfAny(oldLhsYield); } +void OrderedAssignmentRewriter::generateMaskIfOp(MaskedArrayExpr &mask) { + assert(whereLoopNest.has_value() && "must be inside a WHERE"); + mlir::Location loc = mask.loc; + hlfir::Entity maskVal{generateMaskedEntity(mask)}; + maskVal = hlfir::loadTrivialScalar(loc, builder, maskVal); + mlir::Value cdt = builder.createConvert(loc, builder.getI1Type(), maskVal); + // Else region is added when visiting nested hlfir.elseWhereOp, if any. + auto ifOp = builder.create(loc, std::nullopt, cdt, + /*withElseRegion=*/false); + constructStack.push_back(ifOp.getOperation()); + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); +} + void OrderedAssignmentRewriter::pre(hlfir::WhereOp whereOp) { mlir::Location loc = whereOp.getLoc(); - TODO(loc, "WHERE in HLFIR"); + MaskedArrayExpr mask(loc, whereOp.getMaskRegion()); + if (!whereLoopNest) { + // Start a loop nest iterating on the shape of the where mask. + mask.generateNoneElementalPart(builder, mapper); + mlir::Value shape = mask.generateShape(builder, mapper); + whereLoopNest = hlfir::genLoopNest(loc, builder, shape); + constructStack.push_back(whereLoopNest->outerLoop.getOperation()); + builder.setInsertionPointToStart(whereLoopNest->innerLoop.getBody()); + } + // Generate a fir.if with the value of the current element of the mask + // inside the loops. + generateMaskIfOp(mask); +} + +void OrderedAssignmentRewriter::post(hlfir::WhereOp whereOp) { + assert(!constructStack.empty() && "must contain a fir.if"); + builder.setInsertionPointAfter(constructStack.pop_back_val()); + // If all where/elsewhere fir.if have been popped, this is the outer whereOp, + // and the where loop must be exited. 
+ assert(!constructStack.empty() && "must contain a fir.do_loop or fir.if"); + if (mlir::isa(constructStack.back())) { + builder.setInsertionPointAfter(constructStack.pop_back_val()); + whereLoopNest.reset(); + } } void OrderedAssignmentRewriter::pre(hlfir::ElseWhereOp elseWhereOp) { + assert(!constructStack.empty() && "cannot be empty inside a where"); mlir::Location loc = elseWhereOp.getLoc(); - TODO(loc, "ELSEWHERE in HLFIR"); + // Create an "else" region for the current where/elsewhere fir.if. + auto ifOp = mlir::dyn_cast(constructStack.back()); + assert(ifOp && ifOp.getElseRegion().empty() && "must be an if without else"); + builder.createBlock(&ifOp.getElseRegion()); + auto end = builder.create(loc); + builder.setInsertionPoint(end); + if (elseWhereOp.getMaskRegion().empty()) + return; + // Create new nested fir.if with elsewhere mask if any. + MaskedArrayExpr mask(loc, elseWhereOp.getMaskRegion()); + generateMaskIfOp(mask); +} + +void OrderedAssignmentRewriter::post(hlfir::ElseWhereOp elseWhereOp) { + // Exit ifOp that was created for the elseWhereOp mask, if any. + if (elseWhereOp.getMaskRegion().empty()) + return; + assert(!constructStack.empty() && "must contain a fir.if"); + builder.setInsertionPointAfter(constructStack.pop_back_val()); } std::pair> OrderedAssignmentRewriter::generateYieldedEntity(mlir::Region ®ion) { // TODO: if the region was saved, use that instead of generating code again. + if (whereLoopNest.has_value()) { + mlir::Location loc = region.getParentOp()->getLoc(); + return {generateMaskedEntity(loc, region), std::nullopt}; + } assert(region.hasOneBlock() && "region must contain one block"); // Clone all operations except the final hlfir.yield. 
mlir::Block::OpListType &ops = region.back().getOperations(); @@ -258,6 +384,27 @@ return value; } +mlir::Value +OrderedAssignmentRewriter::generateMaskedEntity(MaskedArrayExpr &maskedExpr) { + assert(whereLoopNest.has_value() && "must be inside WHERE loop nest"); + auto insertionPoint = builder.saveInsertionPoint(); + if (!maskedExpr.noneElementalPartWasGenerated) { + // Generate none elemental part before the where loops (but inside the + // current forall loops if any). + builder.setInsertionPoint(whereLoopNest->outerLoop); + maskedExpr.generateNoneElementalPart(builder, mapper); + } + // Generate the none elemental part cleanup after the where loops. + builder.setInsertionPointAfter(whereLoopNest->outerLoop); + maskedExpr.generateNoneElementalCleanupIfAny(builder, mapper); + // Generate the value of the current element for the masked expression + // at the current insertion point (inside the where loops, and any fir.if + // generated for previous masks). + builder.restoreInsertionPoint(insertionPoint); + return maskedExpr.generateElementalParts( + builder, whereLoopNest->oneBasedIndices, mapper); +} + void OrderedAssignmentRewriter::generateCleanupIfAny( std::optional maybeYield) { if (maybeYield.has_value()) @@ -310,6 +457,127 @@ return false; } +/// Is the apply using all the elemental indices in order? +static bool isInOrderApply(hlfir::ApplyOp apply, hlfir::ElementalOp elemental) { + if (elemental.getIndices().size() != apply.getIndices().size()) + return false; + for (auto [elementalIdx, applyIdx] : + llvm::zip(elemental.getIndices(), apply.getIndices())) + if (elementalIdx != applyIdx) + return false; + return true; +} + +/// Gather the chain of hlfir::ElementalOp, if any, that produced \p value. 
+static void +gatherElementalTree(mlir::Value value, + llvm::SmallPtrSetImpl &elementalOps, + bool isOutOfOrder) { + if (auto elemental = value.getDefiningOp()) { + // Only inline an applied elemental that must be executed in order if the + // applying indices are in order. An hlfir::Elemental may have been created + // for a transformational like transpose, and Fortran 2018 standard + // section 10.2.3.2, point 10 imply that impure elemental sub-expression + // evaluations should not be masked if they are the arguments of + // transformational expressions. + if (isOutOfOrder && elemental.isOrdered()) + return; + elementalOps.insert(elemental.getOperation()); + for (mlir::Operation &op : elemental.getBody()->getOperations()) + if (auto apply = mlir::dyn_cast(op)) { + bool isUnorderedApply = + isOutOfOrder || !isInOrderApply(apply, elemental); + gatherElementalTree(apply.getExpr(), elementalOps, isUnorderedApply); + } + } +} + +MaskedArrayExpr::MaskedArrayExpr(mlir::Location loc, mlir::Region ®ion) + : loc{loc}, region{region} { + mlir::Operation &terminator = region.back().back(); + // TODO: clarify if vector subscripts must be inlined or not here. + // In case of x(elemental(A), :), this could lead to more elemental(A) + // evaluation than needed, which is not OK if "elemental" is impure. + // The standard is not very clear here. + if (mlir::isa(terminator)) + TODO(loc, "vector subscripted assignments inside WHERE"); + mlir::Value entity = mlir::cast(terminator).getEntity(); + gatherElementalTree(entity, elementalParts, /*isOutOfOrder=*/false); +} + +void MaskedArrayExpr::generateNoneElementalPart(fir::FirOpBuilder &builder, + mlir::IRMapping &mapper) { + assert(!noneElementalPartWasGenerated && + "none elemental parts already generated"); + // Clone all operations, except the elemental and the final yield. 
+ mlir::Block::OpListType &ops = region.back().getOperations(); + assert(!ops.empty() && "yield block cannot be empty"); + auto end = ops.end(); + for (auto opIt = ops.begin(); std::next(opIt) != end; ++opIt) + if (!elementalParts.contains(&*opIt)) + (void)builder.clone(*opIt, mapper); + noneElementalPartWasGenerated = true; +} + +mlir::Value MaskedArrayExpr::generateShape(fir::FirOpBuilder &builder, + mlir::IRMapping &mapper) { + assert(noneElementalPartWasGenerated && + "non elemental part must have been generated"); + mlir::Operation &terminator = region.back().back(); + // If the operation that produced the yielded entity is elemental, it was not + // cloned, but it holds a shape argument that was cloned. Return the cloned + // shape. + if (auto elementalAddrOp = mlir::dyn_cast(terminator)) + return mapper.lookupOrDefault(elementalAddrOp.getShape()); + mlir::Value entity = mlir::cast(terminator).getEntity(); + if (auto elemental = entity.getDefiningOp()) + return mapper.lookupOrDefault(elemental.getShape()); + // Otherwise, the whole entity was cloned, and the shape can be generated + // from it. 
+ hlfir::Entity clonedEntity{mapper.lookupOrDefault(entity)}; + return hlfir::genShape(loc, builder, hlfir::Entity{clonedEntity}); +} + +mlir::Value +MaskedArrayExpr::generateElementalParts(fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::IRMapping &mapper) { + assert(noneElementalPartWasGenerated && + "non elemental part must have been generated"); + mlir::Operation &terminator = region.back().back(); + if (mlir::isa(terminator)) + TODO(loc, "vector subscripted assignments inside WHERE"); + mlir::Value entity = mlir::cast(terminator).getEntity(); + auto elemental = entity.getDefiningOp(); + if (!elemental) { + hlfir::Entity clonedEntity{mapper.lookupOrDefault(entity)}; + return hlfir::getElementAt(loc, builder, clonedEntity, oneBasedIndices); + } + auto mustRecursivelyInline = + [&](hlfir::ElementalOp appliedElemental) -> bool { + return elementalParts.contains(appliedElemental.getOperation()); + }; + return inlineElementalOp(loc, builder, elemental, oneBasedIndices, mapper, + mustRecursivelyInline); +} + +void MaskedArrayExpr::generateNoneElementalCleanupIfAny( + fir::FirOpBuilder &builder, mlir::IRMapping &mapper) { + mlir::Operation &terminator = region.back().back(); + if (mlir::isa(terminator)) + TODO(loc, "vector subscripted assignments inside WHERE"); + auto yieldOp = mlir::cast(terminator); + if (yieldOp.getCleanup().empty()) + return; + for (mlir::Operation &op : yieldOp.getCleanup().getOps()) { + if (auto destroy = mlir::dyn_cast(op)) + if (elementalParts.contains(destroy.getExpr().getDefiningOp())) + continue; + if (!mlir::isa(op)) + (void)builder.clone(op, mapper); + } +} + /// Lower an ordered assignment tree to fir.do_loop and hlfir.assign given /// a schedule. 
static void lower(hlfir::OrderedAssignmentTreeOpInterface root, @@ -371,8 +639,9 @@ mlir::LogicalResult matchAndRewrite(hlfir::WhereOp whereOp, mlir::PatternRewriter &rewriter) const override { - TODO(whereOp.getLoc(), "WHERE construct or statement in HLFIR"); - return mlir::failure(); + auto root = mlir::cast( + whereOp.getOperation()); + return ::rewrite(root, tryFusingAssignments, rewriter); } const bool tryFusingAssignments; }; diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp --- a/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ScheduleOrderedAssignments.cpp @@ -568,6 +568,12 @@ os << "rhs"; else if (&assign.getLhsRegion() == &yieldRegion) os << "lhs"; + } else if (auto where = mlir::dyn_cast(parent)) { + if (&where.getMaskRegion() == &yieldRegion) + os << "mask"; + } else if (auto elseWhereOp = mlir::dyn_cast(parent)) { + if (&elseWhereOp.getMaskRegion() == &yieldRegion) + os << "mask"; } else { os << "unknown"; } diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -777,8 +777,23 @@ return false; } +static bool IsAtProcess(const char *p) { + static const char pAtProc[]{"process"}; + for (std::size_t i{0}; i < sizeof pAtProc - 1; ++i) { + if (ToLowerCaseLetter(*++p) != pAtProc[i]) + return false; + } + return true; +} + bool Prescanner::IsFixedFormCommentLine(const char *start) const { const char *p{start}; + + // The @process directive must start in column 1. + if (*p == '@' && IsAtProcess(p)) { + return true; + } + if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c. ((*p == 'D' || *p == 'd') && !features_.IsEnabled(LanguageFeature::OldDebugLines))) { @@ -810,6 +825,8 @@ p = SkipWhiteSpaceAndCComments(p); if (*p == '!' 
|| *p == '\n') { return p; + } else if (*p == '@') { + return IsAtProcess(p) ? p : nullptr; } else { return nullptr; } diff --git a/flang/test/Fir/omp-declare-target-data.fir b/flang/test/Fir/omp-declare-target-data.fir new file mode 100644 --- /dev/null +++ b/flang/test/Fir/omp-declare-target-data.fir @@ -0,0 +1,78 @@ +// RUN: fir-opt --fir-to-llvm-ir %s | FileCheck %s + +module attributes {omp.is_device = #omp.isdevice} { + + // CHECK: llvm.mlir.global external @_QMtest_0Earray_1d(dense<[1, 2, 3]> : tensor<3xi32>) {{{.*}}omp.declare_target = #omp.declaretarget{{.*}}} : !llvm.array<3 x i32> + fir.global @_QMtest_0Earray_1d(dense<[1, 2, 3]> : tensor<3xi32>) {omp.declare_target = #omp.declaretarget} : !fir.array<3xi32> + + // CHECK: llvm.mlir.global external @_QMtest_0Earray_2d() {{{.*}}omp.declare_target = #omp.declaretarget{{.*}}} : !llvm.array<2 x array<2 x i32>> + fir.global @_QMtest_0Earray_2d {omp.declare_target = #omp.declaretarget} : !fir.array<2x2xi32> { + %0 = fir.undefined !fir.array<2x2xi32> + %c1_i32 = arith.constant 1 : i32 + %1 = fir.insert_value %0, %c1_i32, [0 : index, 0 : index] : (!fir.array<2x2xi32>, i32) -> !fir.array<2x2xi32> + %c2_i32 = arith.constant 2 : i32 + %2 = fir.insert_value %1, %c2_i32, [1 : index, 0 : index] : (!fir.array<2x2xi32>, i32) -> !fir.array<2x2xi32> + %c3_i32 = arith.constant 3 : i32 + %3 = fir.insert_value %2, %c3_i32, [0 : index, 1 : index] : (!fir.array<2x2xi32>, i32) -> !fir.array<2x2xi32> + %c4_i32 = arith.constant 4 : i32 + %4 = fir.insert_value %3, %c4_i32, [1 : index, 1 : index] : (!fir.array<2x2xi32>, i32) -> !fir.array<2x2xi32> + %c2 = arith.constant 2 : index + %c2_0 = arith.constant 2 : index + fir.has_value %4 : !fir.array<2x2xi32> + } + + // CHECK: llvm.mlir.global external @_QMtest_0Edata_extended_link_1() {{{.*}}omp.declare_target = #omp.declaretarget{{.*}}} : f32 + fir.global @_QMtest_0Edata_extended_link_1 {omp.declare_target = #omp.declaretarget} : f32 { + %cst = arith.constant 2.000000e+00 : f32 + 
fir.has_value %cst : f32 + } + + // CHECK: llvm.mlir.global external @_QMtest_0Edata_extended_link_2() {{{.*}}omp.declare_target = #omp.declaretarget{{.*}}} : f32 + fir.global @_QMtest_0Edata_extended_link_2 {omp.declare_target = #omp.declaretarget} : f32 { + %cst = arith.constant 3.000000e+00 : f32 + fir.has_value %cst : f32 + } + + // CHECK: llvm.mlir.global external @_QMtest_0Edata_extended_to_1() {{{.*}}omp.declare_target = #omp.declaretarget{{.*}}} : f32 + fir.global @_QMtest_0Edata_extended_to_1 {omp.declare_target = #omp.declaretarget} : f32 { + %cst = arith.constant 2.000000e+00 : f32 + fir.has_value %cst : f32 + } + + // CHECK: llvm.mlir.global external @_QMtest_0Edata_extended_to_2() {{{.*}}omp.declare_target = #omp.declaretarget{{.*}}} : f32 { + fir.global @_QMtest_0Edata_extended_to_2 {omp.declare_target = #omp.declaretarget} : f32 { + %cst = arith.constant 3.000000e+00 : f32 + fir.has_value %cst : f32 + } + + // CHECK: llvm.mlir.global external @_QMtest_0Edata_int() {{{.*}}omp.declare_target = #omp.declaretarget{{.*}}} : i32 + fir.global @_QMtest_0Edata_int {omp.declare_target = #omp.declaretarget} : i32 { + %c10_i32 = arith.constant 10 : i32 + fir.has_value %c10_i32 : i32 + } + + // CHECK: llvm.mlir.global external @_QMtest_0Edata_int_clauseless() {{{.*}}omp.declare_target = #omp.declaretarget{{.*}}} : i32 + fir.global @_QMtest_0Edata_int_clauseless {omp.declare_target = #omp.declaretarget} : i32 { + %c1_i32 = arith.constant 1 : i32 + fir.has_value %c1_i32 : i32 + } + + // CHECK: llvm.mlir.global external @_QMtest_0Edata_int_to() {{{.*}}omp.declare_target = #omp.declaretarget{{.*}}} : i32 + fir.global @_QMtest_0Edata_int_to {omp.declare_target = #omp.declaretarget} : i32 { + %c5_i32 = arith.constant 5 : i32 + fir.has_value %c5_i32 : i32 + } + + // CHECK: llvm.mlir.global external @_QMtest_0Ept1() {{{.*}}omp.declare_target = #omp.declaretarget{{.*}}} : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> { + fir.global @_QMtest_0Ept1 {omp.declare_target = 
#omp.declaretarget} : !fir.box> { + %0 = fir.zero_bits !fir.ptr + %1 = fir.embox %0 : (!fir.ptr) -> !fir.box> + fir.has_value %1 : !fir.box> + } + + // CHECK: llvm.mlir.global external @_QMtest_0Ept2_tar() {{{.*}}omp.declare_target = #omp.declaretarget{{.*}}} : i32 + fir.global @_QMtest_0Ept2_tar {omp.declare_target = #omp.declaretarget} target : i32 { + %c5_i32 = arith.constant 5 : i32 + fir.has_value %c5_i32 : i32 + } +} diff --git a/flang/test/HLFIR/order_assignments/where-codegen-no-conflict.fir b/flang/test/HLFIR/order_assignments/where-codegen-no-conflict.fir new file mode 100644 --- /dev/null +++ b/flang/test/HLFIR/order_assignments/where-codegen-no-conflict.fir @@ -0,0 +1,309 @@ +// Test code generation of hlfir.where, and hlfir.elsewhere when there +// is no need to create temporary storage. +// RUN: fir-opt %s --lower-hlfir-ordered-assignments | FileCheck %s + +func.func @test_simple(%arg0: !fir.box>, %arg1: !fir.box>>) { + %cst = arith.constant 4.200000e+01 : f32 + %0:2 = hlfir.declare %arg1 {uniq_name = "mask"} : (!fir.box>>) -> (!fir.box>>, !fir.box>>) + %1:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box>) -> (!fir.box>, !fir.box>) + hlfir.where { + hlfir.yield %0#0 : !fir.box>> + } do { + hlfir.region_assign { + hlfir.yield %cst : f32 + } to { + hlfir.yield %1#0 : !fir.box> + } + } + return +} +// CHECK-LABEL: func.func @test_simple( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.box>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.box>>) { +// CHECK: %[[VAL_2:.*]] = arith.constant 4.200000e+01 : f32 +// CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "mask"} : (!fir.box>>) -> (!fir.box>>, !fir.box>>) +// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box>) -> (!fir.box>, !fir.box>) +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_5]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1> 
+// CHECK: %[[VAL_8:.*]] = arith.constant 1 : index +// CHECK: fir.do_loop %[[VAL_9:.*]] = %[[VAL_8]] to %[[VAL_6]]#1 step %[[VAL_8]] { +// CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_9]]) : (!fir.box>>, index) -> !fir.ref> +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_10]] : !fir.ref> +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1 +// CHECK: fir.if %[[VAL_12]] { +// CHECK: %[[VAL_13:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_9]]) : (!fir.box>, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_13]] : f32, !fir.ref +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + + +func.func @test_elsewhere(%arg0: !fir.ref>, %arg1: !fir.ref>, %arg2: !fir.ref>, %arg3: !fir.ref>>, %arg4: !fir.ref>> {fir.bindc_name = "mask2"}) { + %c100 = arith.constant 100 : index + %0 = fir.shape %c100 : (index) -> !fir.shape<1> + %1:2 = hlfir.declare %arg3(%0) {uniq_name = "mask"} : (!fir.ref>>, !fir.shape<1>) -> (!fir.ref>>, !fir.ref>>) + %2:2 = hlfir.declare %arg4(%0) {uniq_name = "mask2"} : (!fir.ref>>, !fir.shape<1>) -> (!fir.ref>>, !fir.ref>>) + %3:2 = hlfir.declare %arg0(%0) {uniq_name = "x"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %4:2 = hlfir.declare %arg1(%0) {uniq_name = "y"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %5:2 = hlfir.declare %arg2(%0) {uniq_name = "z"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + hlfir.where { + hlfir.yield %1#0 : !fir.ref>> + } do { + hlfir.region_assign { + hlfir.yield %4#0 : !fir.ref> + } to { + hlfir.yield %3#0 : !fir.ref> + } + hlfir.elsewhere mask { + hlfir.yield %2#0 : !fir.ref>> + } do { + hlfir.region_assign { + hlfir.yield %3#0 : !fir.ref> + } to { + hlfir.yield %4#0 : !fir.ref> + } + hlfir.elsewhere do { + hlfir.region_assign { + hlfir.yield %4#0 : !fir.ref> + } to { + hlfir.yield %5#0 : !fir.ref> + } + } + } + } + return +} +// CHECK-LABEL: func.func @test_elsewhere( +// CHECK-SAME: %[[VAL_0:[^:]*]]: !fir.ref>, +// CHECK-SAME: 
%[[VAL_1:[^:]*]]: !fir.ref>, +// CHECK-SAME: %[[VAL_2:[^:]*]]: !fir.ref>, +// CHECK-SAME: %[[VAL_3:[^:]*]]: !fir.ref>>, +// CHECK-SAME: %[[VAL_4:[^:]*]]: !fir.ref>> {fir.bindc_name = "mask2"}) { +// CHECK: %[[VAL_5:.*]] = arith.constant 100 : index +// CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_6]]) {uniq_name = "mask"} : (!fir.ref>>, !fir.shape<1>) -> (!fir.ref>>, !fir.ref>>) +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_6]]) {uniq_name = "mask2"} : (!fir.ref>>, !fir.shape<1>) -> (!fir.ref>>, !fir.ref>>) +// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "x"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) {uniq_name = "y"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_6]]) {uniq_name = "z"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_12:.*]] = arith.constant 1 : index +// CHECK: fir.do_loop %[[VAL_13:.*]] = %[[VAL_12]] to %[[VAL_5]] step %[[VAL_12]] { +// CHECK: %[[VAL_14:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_13]]) : (!fir.ref>>, index) -> !fir.ref> +// CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_14]] : !fir.ref> +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (!fir.logical<4>) -> i1 +// CHECK: fir.if %[[VAL_16]] { +// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_10]]#0 (%[[VAL_13]]) : (!fir.ref>, index) -> !fir.ref +// CHECK: %[[VAL_18:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_13]]) : (!fir.ref>, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_18]] : !fir.ref, !fir.ref +// CHECK: } +// CHECK: } +// CHECK: %[[VAL_19:.*]] = arith.constant 1 : index +// CHECK: fir.do_loop %[[VAL_20:.*]] = %[[VAL_19]] to %[[VAL_5]] step %[[VAL_19]] { +// CHECK: %[[VAL_21:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_20]]) : (!fir.ref>>, index) -> !fir.ref> 
+// CHECK: %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.ref> +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1 +// CHECK: fir.if %[[VAL_23]] { +// CHECK: } else { +// CHECK: %[[VAL_24:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_20]]) : (!fir.ref>>, index) -> !fir.ref> +// CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref> +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1 +// CHECK: fir.if %[[VAL_26]] { +// CHECK: %[[VAL_27:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_20]]) : (!fir.ref>, index) -> !fir.ref +// CHECK: %[[VAL_28:.*]] = hlfir.designate %[[VAL_10]]#0 (%[[VAL_20]]) : (!fir.ref>, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_27]] to %[[VAL_28]] : !fir.ref, !fir.ref +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: %[[VAL_29:.*]] = arith.constant 1 : index +// CHECK: fir.do_loop %[[VAL_30:.*]] = %[[VAL_29]] to %[[VAL_5]] step %[[VAL_29]] { +// CHECK: %[[VAL_31:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_30]]) : (!fir.ref>>, index) -> !fir.ref> +// CHECK: %[[VAL_32:.*]] = fir.load %[[VAL_31]] : !fir.ref> +// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (!fir.logical<4>) -> i1 +// CHECK: fir.if %[[VAL_33]] { +// CHECK: } else { +// CHECK: %[[VAL_34:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_30]]) : (!fir.ref>>, index) -> !fir.ref> +// CHECK: %[[VAL_35:.*]] = fir.load %[[VAL_34]] : !fir.ref> +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_35]] : (!fir.logical<4>) -> i1 +// CHECK: fir.if %[[VAL_36]] { +// CHECK: } else { +// CHECK: %[[VAL_37:.*]] = hlfir.designate %[[VAL_10]]#0 (%[[VAL_30]]) : (!fir.ref>, index) -> !fir.ref +// CHECK: %[[VAL_38:.*]] = hlfir.designate %[[VAL_11]]#0 (%[[VAL_30]]) : (!fir.ref>, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_37]] to %[[VAL_38]] : !fir.ref, !fir.ref +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +func.func @expr_tree(%arg0: !fir.box>, %arg1: !fir.box>, %arg2: !fir.box>>) { + %cst = arith.constant 0.000000e+00 : f32 + 
%c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + %0:2 = hlfir.declare %arg2 {uniq_name = "mask"} : (!fir.box>>) -> (!fir.box>>, !fir.box>>) + %1:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box>) -> (!fir.box>, !fir.box>) + %2:2 = hlfir.declare %arg1 {uniq_name = "y"} : (!fir.box>) -> (!fir.box>, !fir.box>) + hlfir.where { + %3 = fir.shape %c10 : (index) -> !fir.shape<1> + %4 = hlfir.designate %2#0 (%c10:%c1:%c-1) shape %3 : (!fir.box>, index, index, index, !fir.shape<1>) -> !fir.box> + %5 = hlfir.elemental %3 : (!fir.shape<1>) -> !hlfir.expr<10xf32> { + ^bb0(%arg3: index): + %9 = hlfir.designate %4 (%arg3) : (!fir.box>, index) -> !fir.ref + %10 = fir.load %9 : !fir.ref + %11 = math.absf %10 fastmath : f32 + hlfir.yield_element %11 : f32 + } + %6 = hlfir.elemental %3 : (!fir.shape<1>) -> !hlfir.expr<10x!fir.logical<4>> { + ^bb0(%arg3: index): + %9 = hlfir.apply %5, %arg3 : (!hlfir.expr<10xf32>, index) -> f32 + %10 = arith.cmpf ogt, %9, %cst : f32 + %11 = fir.convert %10 : (i1) -> !fir.logical<4> + hlfir.yield_element %11 : !fir.logical<4> + } + %7 = hlfir.elemental %3 : (!fir.shape<1>) -> !hlfir.expr<10x!fir.logical<4>> { + ^bb0(%arg3: index): + %9 = hlfir.apply %6, %arg3 : (!hlfir.expr<10x!fir.logical<4>>, index) -> !fir.logical<4> + %10 = hlfir.no_reassoc %9 : !fir.logical<4> + hlfir.yield_element %10 : !fir.logical<4> + } + %8 = hlfir.elemental %3 : (!fir.shape<1>) -> !hlfir.expr<10x!fir.logical<4>> { + ^bb0(%arg3: index): + %9 = hlfir.apply %7, %arg3 : (!hlfir.expr<10x!fir.logical<4>>, index) -> !fir.logical<4> + %10 = hlfir.designate %0#0 (%arg3) : (!fir.box>>, index) -> !fir.ref> + %11 = fir.load %10 : !fir.ref> + %12 = fir.convert %9 : (!fir.logical<4>) -> i1 + %13 = fir.convert %11 : (!fir.logical<4>) -> i1 + %14 = arith.andi %12, %13 : i1 + %15 = fir.convert %14 : (i1) -> !fir.logical<4> + hlfir.yield_element %15 : !fir.logical<4> + } + hlfir.yield %8 : !hlfir.expr<10x!fir.logical<4>> cleanup { + 
hlfir.destroy %8 : !hlfir.expr<10x!fir.logical<4>> + hlfir.destroy %7 : !hlfir.expr<10x!fir.logical<4>> + hlfir.destroy %6 : !hlfir.expr<10x!fir.logical<4>> + hlfir.destroy %5 : !hlfir.expr<10xf32> + } + } do { + hlfir.region_assign { + hlfir.yield %2#0 : !fir.box> + } to { + hlfir.yield %1#0 : !fir.box> + } + } + return +} +// CHECK-LABEL: func.func @expr_tree( +// CHECK-SAME: %[[VAL_0:[^:]*]]: !fir.box>, +// CHECK-SAME: %[[VAL_1:[^:]*]]: !fir.box>, +// CHECK-SAME: %[[VAL_2:.*]]: !fir.box>>) { +// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_4:.*]] = arith.constant -1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_6:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "mask"} : (!fir.box>>) -> (!fir.box>>, !fir.box>>) +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box>) -> (!fir.box>, !fir.box>) +// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "y"} : (!fir.box>) -> (!fir.box>, !fir.box>) +// CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_6]]:%[[VAL_5]]:%[[VAL_4]]) shape %[[VAL_10]] : (!fir.box>, index, index, index, !fir.shape<1>) -> !fir.box> +// CHECK: %[[VAL_12:.*]] = arith.constant 1 : index +// CHECK: fir.do_loop %[[VAL_13:.*]] = %[[VAL_12]] to %[[VAL_6]] step %[[VAL_12]] { +// CHECK: %[[VAL_14:.*]] = hlfir.designate %[[VAL_11]] (%[[VAL_13]]) : (!fir.box>, index) -> !fir.ref +// CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_14]] : !fir.ref +// CHECK: %[[VAL_16:.*]] = math.absf %[[VAL_15]] fastmath : f32 +// CHECK: %[[VAL_17:.*]] = arith.cmpf ogt, %[[VAL_16]], %[[VAL_3]] : f32 +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (i1) -> !fir.logical<4> +// CHECK: %[[VAL_19:.*]] = hlfir.no_reassoc %[[VAL_18]] : !fir.logical<4> +// CHECK: %[[VAL_20:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_13]]) : (!fir.box>>, index) -> 
!fir.ref> +// CHECK: %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref> +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1 +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.logical<4>) -> i1 +// CHECK: %[[VAL_24:.*]] = arith.andi %[[VAL_22]], %[[VAL_23]] : i1 +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i1) -> !fir.logical<4> +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1 +// CHECK: fir.if %[[VAL_26]] { +// CHECK: %[[VAL_27:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_13]]) : (!fir.box>, index) -> !fir.ref +// CHECK: %[[VAL_28:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_13]]) : (!fir.box>, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_27]] to %[[VAL_28]] : !fir.ref, !fir.ref +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +func.func @inside_forall(%arg0: !fir.ref>, %arg1: !fir.ref>) { + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f32 + %c10_i32 = arith.constant 10 : i32 + %c1_i32 = arith.constant 1 : i32 + %c10 = arith.constant 10 : index + %c20 = arith.constant 20 : index + %0 = fir.shape %c10, %c20 : (index, index) -> !fir.shape<2> + %1:2 = hlfir.declare %arg0(%0) {uniq_name = "x"} : (!fir.ref>, !fir.shape<2>) -> (!fir.ref>, !fir.ref>) + %2 = fir.shape %c20 : (index) -> !fir.shape<1> + %3:2 = hlfir.declare %arg1(%2) {uniq_name = "y"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + hlfir.forall lb { + hlfir.yield %c1_i32 : i32 + } ub { + hlfir.yield %c10_i32 : i32 + } (%arg2: i32) { + hlfir.where { + %4 = hlfir.elemental %2 : (!fir.shape<1>) -> !hlfir.expr<20x!fir.logical<4>> { + ^bb0(%arg3: index): + %5 = hlfir.designate %3#0 (%arg3) : (!fir.ref>, index) -> !fir.ref + %6 = fir.load %5 : !fir.ref + %7 = arith.cmpf ogt, %6, %cst : f32 + %8 = fir.convert %7 : (i1) -> !fir.logical<4> + hlfir.yield_element %8 : !fir.logical<4> + } + hlfir.yield %4 : !hlfir.expr<20x!fir.logical<4>> cleanup { + hlfir.destroy %4 : !hlfir.expr<20x!fir.logical<4>> + 
} + } do { + hlfir.region_assign { + hlfir.yield %3#0 : !fir.ref> + } to { + %4 = fir.convert %arg2 : (i32) -> i64 + %5 = hlfir.designate %1#0 (%4, %c1:%c20:%c1) shape %2 : (!fir.ref>, i64, index, index, index, !fir.shape<1>) -> !fir.box> + hlfir.yield %5 : !fir.box> + } + } + } + return +} +// CHECK-LABEL: func.func @inside_forall( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>) { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_4:.*]] = arith.constant 10 : i32 +// CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_6:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_7:.*]] = arith.constant 20 : index +// CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_6]], %[[VAL_7]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_8]]) {uniq_name = "x"} : (!fir.ref>, !fir.shape<2>) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_10]]) {uniq_name = "y"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_5]] : (i32) -> index +// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_4]] : (i32) -> index +// CHECK: %[[VAL_14:.*]] = arith.constant 1 : index +// CHECK: fir.do_loop %[[VAL_15:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_14]] { +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (index) -> i32 +// CHECK: %[[VAL_17:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (i32) -> i64 +// CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_18]], %[[VAL_2]]:%[[VAL_7]]:%[[VAL_2]]) shape %[[VAL_10]] : (!fir.ref>, i64, index, index, index, !fir.shape<1>) -> !fir.box> +// CHECK: fir.do_loop %[[VAL_20:.*]] = %[[VAL_17]] to %[[VAL_7]] step %[[VAL_17]] { +// CHECK: %[[VAL_21:.*]] = hlfir.designate %[[VAL_11]]#0 
(%[[VAL_20]]) : (!fir.ref>, index) -> !fir.ref +// CHECK: %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.ref +// CHECK: %[[VAL_23:.*]] = arith.cmpf ogt, %[[VAL_22]], %[[VAL_3]] : f32 +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (i1) -> !fir.logical<4> +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (!fir.logical<4>) -> i1 +// CHECK: fir.if %[[VAL_25]] { +// CHECK: %[[VAL_26:.*]] = hlfir.designate %[[VAL_11]]#0 (%[[VAL_20]]) : (!fir.ref>, index) -> !fir.ref +// CHECK: %[[VAL_27:.*]] = hlfir.designate %[[VAL_19]] (%[[VAL_20]]) : (!fir.box>, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_26]] to %[[VAL_27]] : !fir.ref, !fir.ref +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } diff --git a/flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90 b/flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90 new file mode 100644 --- /dev/null +++ b/flang/test/HLFIR/order_assignments/where-fusing-scheduling.f90 @@ -0,0 +1,41 @@ +! Test scheduling of WHERE in lower-hlfir-ordered-assignments pass +! when fusing is enabled or disabled. + +!RUN: bbc -hlfir -o - -pass-pipeline="builtin.module(lower-hlfir-ordered-assignments{fuse-assignments=false})" --debug-only=flang-ordered-assignment -flang-dbg-order-assignment-schedule-only %s 2>&1 | FileCheck %s --check-prefix NOFUSE + +!RUN: bbc -hlfir -o - -pass-pipeline="builtin.module(lower-hlfir-ordered-assignments{fuse-assignments=true})" --debug-only=flang-ordered-assignment -flang-dbg-order-assignment-schedule-only %s 2>&1 | FileCheck %s --check-prefix FUSE + +!REQUIRES: asserts + +subroutine fusable(x, y, mask) + real :: x(:), y(:) + logical :: mask(:) + where (mask) + x = 41. + y = 42. 
+ end where +end subroutine + +subroutine unfusable(x, y, mask) + real :: x(:), y(:) + logical :: mask(:) + where (mask) + x(1:10) = y + y = x(10:1:-1) + end where +end subroutine + +!NOFUSE-LABEL: ------------ scheduling where in _QPfusable ------------ +!NOFUSE-NEXT: run 1 evaluate: where/region_assign1 +!NOFUSE-NEXT: run 2 evaluate: where/region_assign2 +!NOFUSE-LABEL: ------------ scheduling where in _QPunfusable ------------ +!NOFUSE-NEXT: run 1 evaluate: where/region_assign1 +!NOFUSE-NEXT: run 2 evaluate: where/region_assign2 + +!FUSE-LABEL: ------------ scheduling where in _QPfusable ------------ +!FUSE-NEXT: run 1 evaluate: where/region_assign1 +!FUSE-NEXT: run 1 evaluate: where/region_assign2 +!FUSE-LABEL: ------------ scheduling where in _QPunfusable ------------ +!FUSE-NEXT: run 1 evaluate: where/region_assign1 +!FUSE-NEXT: conflict: R/W: of type '!fir.box>' at index: 1 W: of type '!fir.box>' at index: 1 +!FUSE-NEXT: run 2 evaluate: where/region_assign2 diff --git a/flang/test/HLFIR/order_assignments/where-scheduling.f90 b/flang/test/HLFIR/order_assignments/where-scheduling.f90 new file mode 100644 --- /dev/null +++ b/flang/test/HLFIR/order_assignments/where-scheduling.f90 @@ -0,0 +1,128 @@ +! Test scheduling of WHERE in lower-hlfir-ordered-assignments pass. + +! RUN: bbc -hlfir -o - -pass-pipeline="builtin.module(lower-hlfir-ordered-assignments)" --debug-only=flang-ordered-assignment -flang-dbg-order-assignment-schedule-only %s 2>&1 | FileCheck %s +! REQUIRES: asserts + +subroutine no_conflict(x, y) + real :: x(:), y(:) + where (y.gt.0) x = y +end subroutine + +subroutine fake_conflict(x, y) + ! The conflict here could be avoided because the read and write are + ! aligned, so there would not be any read after write at the element + ! level, but this will require a bit more work to detect this (like + ! comparing the hlfir.designate operations). 
+ real :: x(:), y(:) + where (x.gt.y) x = y +end subroutine + +subroutine only_once(x, y, z) + interface + impure function call_me_only_once() + logical :: call_me_only_once(10) + end function + end interface + real :: x(:), y(:), z(:) + where (call_me_only_once()) + x = y + z = y + end where +end subroutine + +subroutine rhs_lhs_conflict(x, y) + real :: x(:, :), y(:, :) + where (y.gt.0.) x = transpose(x) +end subroutine + +subroutine where_construct_no_conflict(x, y, z, mask1, mask2) + real :: x(:), y(:), z(:) + logical :: mask1(:), mask2(:) + where (mask1) + x = y + elsewhere (mask2) + z = y + end where +end subroutine + +subroutine where_construct_conflict(x, y) + real :: x(:, :), y(:, :) + where (y.gt.0.) + x = y + elsewhere (x.gt.0) + y = x + end where +end subroutine + +subroutine where_construct_conflict_2(x, y) + real :: x(:, :), y(:, :) + where (x.gt.0.) + x = y + elsewhere (y.gt.0) + y = x + end where +end subroutine + +subroutine where_vector_subscript_conflict_1(x, vec1) + real :: x(10) + integer :: vec1(10) + where (x(vec1).lt.0.) x = 42. +end subroutine + +subroutine where_vector_subscript_conflict_2(x, vec1) + integer :: x(10) + real :: y(10) + where (y(x).lt.0.) 
x = 0 +end subroutine + +subroutine where_in_forall_conflict(x) + real :: x(:, :) + forall (i = 1:10) + where (x(i, :).gt.0) x(:, i) = x(i, :) + end forall +end subroutine + +!CHECK-LABEL: ------------ scheduling where in _QPno_conflict ------------ +!CHECK-NEXT: run 1 evaluate: where/region_assign1 +!CHECK-LABEL: ------------ scheduling where in _QPfake_conflict ------------ +!CHECK-NEXT: conflict: R/W: of type '!fir.box>' at index: 0 W: of type '!fir.box>' at index: 0 +!CHECK-NEXT: run 1 save : where/mask +!CHECK-NEXT: run 2 evaluate: where/region_assign1 +!CHECK-LABEL: ------------ scheduling where in _QPonly_once ------------ +!CHECK-NEXT: unknown effect: %9 = fir.call @llvm.stacksave() fastmath : () -> !fir.ref +!CHECK-NEXT: run 1 save (w): where/mask +!CHECK-NEXT: run 2 evaluate: where/region_assign1 +!CHECK-NEXT: run 3 evaluate: where/region_assign2 +!CHECK-LABEL: ------------ scheduling where in _QPrhs_lhs_conflict ------------ +!CHECK-NEXT: unknown effect: %2 = hlfir.transpose %0#0 : (!fir.box>) -> !hlfir.expr +!CHECK-NEXT: run 1 save (w): where/region_assign1/rhs +!CHECK-NEXT: run 2 evaluate: where/region_assign1 +!CHECK-LABEL: ------------ scheduling where in _QPwhere_construct_no_conflict ------------ +!CHECK-NEXT: run 1 evaluate: where/region_assign1 +!CHECK-NEXT: run 2 evaluate: where/elsewhere1/region_assign1 +!CHECK-LABEL: ------------ scheduling where in _QPwhere_construct_conflict ------------ +!CHECK-NEXT: run 1 evaluate: where/region_assign1 +!CHECK-NEXT: conflict: R/W: of type '!fir.box>' at index: 1 W: of type '!fir.box>' at index: 1 +!CHECK-NEXT: run 2 save : where/mask +!CHECK-NEXT: run 3 evaluate: where/elsewhere1/region_assign1 +!CHECK-LABEL: ------------ scheduling where in _QPwhere_construct_conflict_2 ------------ +!CHECK-NEXT: conflict: R/W: of type '!fir.box>' at index: 0 W: of type '!fir.box>' at index: 0 +!CHECK-NEXT: run 1 save : where/mask +!CHECK-NEXT: run 2 evaluate: where/region_assign1 +!CHECK-NEXT: conflict: R/W: of type 
'!fir.box>' at index: 1 W: of type '!fir.box>' at index: 1 +!CHECK-NEXT: run 3 save : where/elsewhere1/mask +!CHECK-NEXT: run 4 evaluate: where/elsewhere1/region_assign1 +!CHECK-LABEL: ------------ scheduling where in _QPwhere_vector_subscript_conflict_1 ------------ +!CHECK-NEXT: conflict: R/W: of type '!fir.ref>' at index: 0 W: of type '!fir.ref>' at index: 0 +!CHECK-NEXT: run 1 save : where/mask +!CHECK-NEXT: run 2 evaluate: where/region_assign1 +!CHECK-LABEL: ------------ scheduling where in _QPwhere_vector_subscript_conflict_2 ------------ +!CHECK-NEXT: conflict: R/W: of type '!fir.ref>' at index: 0 W: of type '!fir.ref>' at index: 0 +!CHECK-NEXT: run 1 save : where/mask +!CHECK-NEXT: run 2 evaluate: where/region_assign1 +!CHECK-LABEL: ------------ scheduling forall in _QPwhere_in_forall_conflict ------------ +!CHECK-NEXT: conflict: R/W: of type '!fir.box>' at index: 0 W: of type '!fir.box>' at index: 0 +!CHECK-NEXT: run 1 save : forall/where1/mask +!CHECK-NEXT: conflict: R/W: of type '!fir.box>' at index: 0 W: of type '!fir.box>' at index: 0 +!CHECK-NEXT: run 1 save : forall/where1/region_assign1/rhs +!CHECK-NEXT: run 2 evaluate: forall/where1/region_assign1 diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90 --- a/flang/test/Lower/OpenACC/acc-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-loop.f90 @@ -2,6 +2,11 @@ ! RUN: bbc -fopenacc -emit-fir %s -o - | FileCheck %s +! CHECK-LABEL: acc.private.recipe @privatization_10x10xf32 : !fir.ref> init { +! CHECK: ^bb0(%{{.*}}: !fir.ref>): +! CHECK: acc.yield %{{.*}} : !fir.ref> +! 
CHECK: } + program acc_loop integer :: i, j @@ -154,7 +159,7 @@ a(i) = b(i) END DO -!CHECK: acc.loop private(%{{.*}} : !fir.ref>) { +!CHECK: acc.loop private(@privatization_10x10xf32 -> %{{.*}} : !fir.ref>) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} @@ -164,7 +169,7 @@ a(i) = b(i) END DO -!CHECK: acc.loop private(%{{.*}}, %{{.*}} : !fir.ref>, !fir.ref>) { +!CHECK: acc.loop private(@privatization_10x10xf32 -> %{{.*}} : !fir.ref>, @privatization_10x10xf32 -> %{{.*}} : !fir.ref>) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} @@ -174,7 +179,7 @@ a(i) = b(i) END DO -!CHECK: acc.loop private(%{{.*}}, %{{.*}} : !fir.ref>, !fir.ref>) { +!CHECK: acc.loop private(@privatization_10x10xf32 -> %{{.*}} : !fir.ref>, @privatization_10x10xf32 -> %{{.*}} : !fir.ref>) { !CHECK: fir.do_loop !CHECK: acc.yield !CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 --- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 @@ -2,6 +2,13 @@ ! RUN: bbc -fopenacc -emit-fir %s -o - | FileCheck %s +! CHECK-LABEL: acc.private.recipe @privatization_10xf32 : !fir.ref> init { +! CHECK: ^bb0(%{{.*}}: !fir.ref>): +! CHECK: acc.yield %{{.*}} : !fir.ref> +! CHECK: } + +! CHECK-LABEL: func.func @_QPacc_parallel_loop() + subroutine acc_parallel_loop integer :: i, j @@ -447,8 +454,8 @@ a(i) = b(i) END DO -! CHECK: acc.parallel firstprivate(%[[B]] : !fir.ref>) private(%[[A]] : !fir.ref>) { -! CHECK: acc.loop private(%[[A]] : !fir.ref>) { +! CHECK: acc.parallel firstprivate(%[[B]] : !fir.ref>) private(@privatization_10xf32 -> %[[A]] : !fir.ref>) { +! CHECK: acc.loop private(@privatization_10xf32 -> %[[A]] : !fir.ref>) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! 
CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-parallel.f90 b/flang/test/Lower/OpenACC/acc-parallel.f90 --- a/flang/test/Lower/OpenACC/acc-parallel.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel.f90 @@ -2,6 +2,13 @@ ! RUN: bbc -fopenacc -emit-fir %s -o - | FileCheck %s +! CHECK-LABEL: acc.private.recipe @privatization_10x10xf32 : !fir.ref> init { +! CHECK: ^bb0(%{{.*}}: !fir.ref>): +! CHECK: acc.yield %{{.*}} : !fir.ref> +! CHECK: } + +! CHECK-LABEL: func.func @_QPacc_parallel() + subroutine acc_parallel integer :: i, j @@ -288,11 +295,11 @@ !CHECK: acc.detach accPtr(%[[ATTACH_D]] : !fir.ptr) {dataClause = 10 : i64, name = "d"} !CHECK: acc.detach accPtr(%[[ATTACH_E]] : !fir.ptr) {dataClause = 10 : i64, name = "e"} - !$acc parallel private(a) firstprivate(b) private(c) - !$acc end parallel +!$acc parallel private(a) firstprivate(b) private(c) +!$acc end parallel -!CHECK: acc.parallel firstprivate(%[[B]] : !fir.ref>) private(%[[A]], %[[C]] : !fir.ref>, !fir.ref>) { -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.parallel firstprivate(%[[B]] : !fir.ref>) private(@privatization_10x10xf32 -> %[[A]] : !fir.ref>, @privatization_10x10xf32 -> %[[C]] : !fir.ref>) { +! CHECK: acc.yield +! CHECK-NEXT: }{{$}} end subroutine acc_parallel diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90 --- a/flang/test/Lower/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90 @@ -2,6 +2,13 @@ ! RUN: bbc -fopenacc -emit-fir %s -o - | FileCheck %s +! CHECK-LABEL: acc.private.recipe @privatization_10xf32 : !fir.ref> init { +! CHECK: ^bb0(%{{.*}}: !fir.ref>): +! CHECK: acc.yield %{{.*}} : !fir.ref> +! CHECK: } + +! CHECK-LABEL: func.func @_QPacc_serial_loop() + subroutine acc_serial_loop integer :: i, j @@ -363,8 +370,8 @@ a(i) = b(i) END DO -! CHECK: acc.serial firstprivate(%[[B]] : !fir.ref>) private(%[[A]] : !fir.ref>) { -! CHECK: acc.loop private(%[[A]] : !fir.ref>) { +! 
CHECK: acc.serial firstprivate(%[[B]] : !fir.ref>) private(@privatization_10xf32 -> %[[A]] : !fir.ref>) { +! CHECK: acc.loop private(@privatization_10xf32 -> %[[A]] : !fir.ref>) { ! CHECK: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-serial.f90 b/flang/test/Lower/OpenACC/acc-serial.f90 --- a/flang/test/Lower/OpenACC/acc-serial.f90 +++ b/flang/test/Lower/OpenACC/acc-serial.f90 @@ -2,6 +2,13 @@ ! RUN: bbc -fopenacc -emit-fir %s -o - | FileCheck %s +! CHECK-LABEL: acc.private.recipe @privatization_10x10xf32 : !fir.ref> init { +! CHECK: ^bb0(%{{.*}}: !fir.ref>): +! CHECK: acc.yield %{{.*}} : !fir.ref> +! CHECK: } + +! CHECK-LABEL: func.func @_QPacc_serial() + subroutine acc_serial integer :: i, j @@ -234,7 +241,7 @@ !$acc serial private(a) firstprivate(b) private(c) !$acc end serial -! CHECK: acc.serial firstprivate(%[[B]] : !fir.ref>) private(%[[A]], %[[C]] : !fir.ref>, !fir.ref>) { +! CHECK: acc.serial firstprivate(%[[B]] : !fir.ref>) private(@privatization_10x10xf32 -> %[[A]] : !fir.ref>, @privatization_10x10xf32 -> %[[C]] : !fir.ref>) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/array.f90 b/flang/test/Lower/array.f90 --- a/flang/test/Lower/array.f90 +++ b/flang/test/Lower/array.f90 @@ -102,33 +102,25 @@ integer, dimension(10) :: a0 real, dimension(2,3) :: a1 integer, dimension(3,4) :: a2 + integer, dimension(2,3,4) :: a3 a0 = (/1, 2, 3, 3, 3, 3, 3, 3, 3, 3/) a1 = reshape((/3.5, 3.5, 3.5, 3.5, 3.5, 3.5/), shape(a1)) a2 = reshape((/1, 3, 3, 5, 3, 3, 3, 3, 9, 9, 9, 8/), shape(a2)) + a3 = reshape((/1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12/), shape(a3)) end subroutine range ! a0 array constructor ! CHECK: fir.global internal @_QQro.10xi4.{{.*}}(dense<[1, 2, 3, 3, 3, 3, 3, 3, 3, 3]> : tensor<10xi32>) constant : !fir.array<10xi32> ! a1 array constructor -! CHECK: fir.global internal @_QQro.2x3xr4.{{.*}} constant : !fir.array<2x3xf32> { - ! 
CHECK-DAG: %cst = arith.constant {{.*}} : f32 - ! CHECK: %{{.*}} = fir.insert_on_range %{{[0-9]+}}, %cst from (0, 0) to (1, 2) : +! CHECK: fir.global internal @_QQro.2x3xr4.{{.*}}(dense<3.500000e+00> : tensor<3x2xf32>) constant : !fir.array<2x3xf32> ! a2 array constructor -! CHECK: fir.global internal @_QQro.3x4xi4.{{.*}} constant : !fir.array<3x4xi32> { - ! CHECK-DAG: %[[c1_i32:.*]] = arith.constant 1 : i32 - ! CHECK-DAG: %[[c3_i32:.*]] = arith.constant 3 : i32 - ! CHECK-DAG: %[[c5_i32:.*]] = arith.constant 5 : i32 - ! CHECK-DAG: %[[c8_i32:.*]] = arith.constant 8 : i32 - ! CHECK-DAG: %[[c9_i32:.*]] = arith.constant 9 : i32 - ! CHECK: %[[r1:.*]] = fir.insert_value %{{.*}}, %{{.*}}, [0 : index, 0 : index] : - ! CHECK: %[[r2:.*]] = fir.insert_on_range %[[r1]], %[[c3_i32]] from (1, 0) to (2, 0) : - ! CHECK: %[[r3:.*]] = fir.insert_value %[[r2]], %{{.*}}, [0 : index, 1 : index] : - ! CHECK: %[[r4:.*]] = fir.insert_on_range %[[r3]], %[[c3_i32]] from (1, 1) to (1, 2) : - ! CHECK: %[[r5:.*]] = fir.insert_on_range %[[r4]], %[[c9_i32]] from (2, 2) to (1, 3) : - ! CHECK: %[[r6:.*]] = fir.insert_value %[[r5]], %{{.*}}, [2 : index, 3 : index] : +! CHECK: fir.global internal @_QQro.3x4xi4.{{.*}}(dense<{{\[\[1, 3, 3], \[5, 3, 3], \[3, 3, 9], \[9, 9, 8]]}}> : tensor<4x3xi32>) constant : !fir.array<3x4xi32> + +! a3 array constructor +! CHECK: fir.global internal @_QQro.2x3x4xi4.{{.*}}(dense<{{\[\[\[1, 1], \[2, 2], \[3, 3]], \[\[4, 4], \[5, 5], \[6, 6]], \[\[7, 7], \[8, 8], \[9, 9]], \[\[10, 10], \[11, 11], \[12, 12]]]}}> : tensor<4x3x2xi32>) constant : !fir.array<2x3x4xi32> ! CHECK-LABEL rangeGlobal subroutine rangeGlobal() @@ -137,6 +129,15 @@ end subroutine rangeGlobal +! CHECK-LABEL hugeGlobal +subroutine hugeGlobal() + integer, parameter :: D = 500 + integer, dimension(D, D) :: a + +! 
CHECK: fir.global internal @_QQro.500x500xi4.{{.*}}(dense<{{.*}}> : tensor<500x500xi32>) constant : !fir.array<500x500xi32> + a = reshape((/(i, i = 1, D * D)/), shape(a)) +end subroutine hugeGlobal + block data real(selected_real_kind(6)) :: x(5,5) common /block/ x diff --git a/flang/test/Lower/dense-array-any-rank.f90 b/flang/test/Lower/dense-array-any-rank.f90 new file mode 100644 --- /dev/null +++ b/flang/test/Lower/dense-array-any-rank.f90 @@ -0,0 +1,25 @@ +! RUN: bbc -emit-fir -o - %s | FileCheck --check-prefixes="CHECK-FIR" %s +! RUN: %flang_fc1 -emit-llvm -o - %s | FileCheck --check-prefixes="CHECK-LLVMIR" %s + +! CHECK-LABEL: test +subroutine test() + integer, dimension(10) :: a1 + integer, dimension(3,4) :: a2 + integer, dimension(2,3,4) :: a3 + + a1 = (/1, 2, 3, 4, 5, 6, 7, 8, 9, 10/) + a2 = reshape((/11, 12, 13, 21, 22, 23, 31, 32, 33, 41, 42, 43/), shape(a2)) + a3 = reshape((/111, 112, 121, 122, 131, 132, 211, 212, 221, 222, 231, 232, 311, 312, 321, 322, 331, 332, 411, 412, 421, 422, 431, 432/), shape(a3)) +end subroutine + +! a1 array constructor +! CHECK-FIR: fir.global internal @_QQro.10xi4.{{.*}}(dense<[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]> : tensor<10xi32>) constant : !fir.array<10xi32> +! CHECK-LLVMIR: @_QQro.10xi4.0 = internal constant [10 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10] + +! a2 array constructor +! CHECK-FIR: fir.global internal @_QQro.3x4xi4.{{.*}}(dense<{{\[\[11, 12, 13], \[21, 22, 23], \[31, 32, 33], \[41, 42, 43]]}}> : tensor<4x3xi32>) constant : !fir.array<3x4xi32> +! CHECK-LLVMIR: @_QQro.3x4xi4.1 = internal constant [4 x [3 x i32]] {{\[\[3 x i32] \[i32 11, i32 12, i32 13], \[3 x i32] \[i32 21, i32 22, i32 23], \[3 x i32] \[i32 31, i32 32, i32 33], \[3 x i32] \[i32 41, i32 42, i32 43]]}} + +! a3 array constructor +! 
CHECK-FIR: fir.global internal @_QQro.2x3x4xi4.{{.*}}(dense<{{\[\[\[111, 112], \[121, 122], \[131, 132]], \[\[211, 212], \[221, 222], \[231, 232]], \[\[311, 312], \[321, 322], \[331, 332]], \[\[411, 412], \[421, 422], \[431, 432]]]}}> : tensor<4x3x2xi32>) constant : !fir.array<2x3x4xi32> +! CHECK-LLVMIR: @_QQro.2x3x4xi4.2 = internal constant [4 x [3 x [2 x i32]]] {{\[\[3 x \[2 x i32]] \[\[2 x i32] \[i32 111, i32 112], \[2 x i32] \[i32 121, i32 122], \[2 x i32] \[i32 131, i32 132]], \[3 x \[2 x i32]] \[\[2 x i32] \[i32 211, i32 212], \[2 x i32] \[i32 221, i32 222], \[2 x i32] \[i32 231, i32 232]], \[3 x \[2 x i32]] \[\[2 x i32] \[i32 311, i32 312], \[2 x i32] \[i32 321, i32 322], \[2 x i32] \[i32 331, i32 332]], \[3 x \[2 x i32]] \[\[2 x i32] \[i32 411, i32 412], \[2 x i32] \[i32 421, i32 422], \[2 x i32] \[i32 431, i32 432]]]}} diff --git a/flang/test/Parser/at-process.f b/flang/test/Parser/at-process.f new file mode 100644 --- /dev/null +++ b/flang/test/Parser/at-process.f @@ -0,0 +1,20 @@ +! RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s + +! Test ignoring @PROCESS directive in fixed source form + +@process opt(3) +@process opt(0) +@process +@processopt(3) + subroutine f() +c@process + end + +!CHECK: Character in fixed-form label field must be a digit +@ + +!CHECK: Character in fixed-form label field must be a digit +@proce + +!CHECK: Character in fixed-form label field must be a digit +@precoss diff --git a/flang/test/Parser/at-process.f90 b/flang/test/Parser/at-process.f90 new file mode 100644 --- /dev/null +++ b/flang/test/Parser/at-process.f90 @@ -0,0 +1,23 @@ +! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s + +! Test ignoring @PROCESS directive in free source form + +@process opt(3) +@process opt(0) + @process strict +@processopt(3) +subroutine f() +print *, "@process" + ! 
@process +end subroutine f + +!CHECK: error: expected '(' +@p + +!CHECK: error: expected '(' +@proce + +!CHECK: error: expected '(' +@precoss +end + diff --git a/flang/unittests/Optimizer/FIRTypesTest.cpp b/flang/unittests/Optimizer/FIRTypesTest.cpp --- a/flang/unittests/Optimizer/FIRTypesTest.cpp +++ b/flang/unittests/Optimizer/FIRTypesTest.cpp @@ -8,13 +8,19 @@ #include "gtest/gtest.h" #include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Dialect/Support/KindMapping.h" #include "flang/Optimizer/Support/InitFIR.h" struct FIRTypesTest : public testing::Test { public: - void SetUp() { fir::support::loadDialects(context); } - + void SetUp() { + fir::support::loadDialects(context); + kindMap = new fir::KindMapping(&context, kindMapInit, "r42a10c14d28i40l41"); + } mlir::MLIRContext context; + fir::KindMapping *kindMap{}; + std::string kindMapInit = + "i10:80,l3:24,a1:8,r54:Double,r62:X86_FP80,r11:PPC_FP128"; }; // Test fir::isPolymorphicType from flang/Optimizer/Dialect/FIRType.h. 
@@ -253,3 +259,22 @@ EXPECT_EQ(ptrArrNone, fir::updateTypeForUnlimitedPolymorphic(ptrArrTy)); } } + +TEST_F(FIRTypesTest, getTypeAsString) { + EXPECT_EQ("i32", + fir::getTypeAsString(mlir::IntegerType::get(&context, 32), *kindMap)); + EXPECT_EQ( + "f64", fir::getTypeAsString(mlir::FloatType::getF64(&context), *kindMap)); + EXPECT_EQ( + "l8", fir::getTypeAsString(fir::LogicalType::get(&context, 1), *kindMap)); + EXPECT_EQ("z32", + fir::getTypeAsString( + mlir::ComplexType::get(mlir::FloatType::getF32(&context)), *kindMap)); + EXPECT_EQ("c8", + fir::getTypeAsString(fir::CharacterType::get(&context, 1, 1), *kindMap)); + EXPECT_EQ("c8x10", + fir::getTypeAsString(fir::CharacterType::get(&context, 1, 10), *kindMap)); + mlir::Type ty = mlir::IntegerType::get(&context, 64); + mlir::Type arrTy = fir::SequenceType::get({10, 20}, ty); + EXPECT_EQ("10x20xi64", fir::getTypeAsString(arrTy, *kindMap)); +} diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -115,6 +115,7 @@ BitReader BitWriter Core + Support ) add_llvm_executable( prepare_builtins utils/prepare-builtins.cpp ) target_compile_definitions( prepare_builtins PRIVATE ${LLVM_VERSION_DEFINE} ) diff --git a/libcxx/docs/Status/Cxx20.rst b/libcxx/docs/Status/Cxx20.rst --- a/libcxx/docs/Status/Cxx20.rst +++ b/libcxx/docs/Status/Cxx20.rst @@ -50,6 +50,7 @@ .. [#note-P0883.2] P0883: ``ATOMIC_FLAG_INIT`` was marked deprecated in version 14.0, but was undeprecated with the implementation of LWG3659 in version 15.0. .. [#note-P2231] P2231: Optional is complete. The changes to variant haven't been implemented yet. .. [#note-P0408] P0408: Only `view()` members implemented. + .. [#note-P0660] P0660: Section 32.3 Stop Tokens is complete. ``jthread`` hasn't been implemented yet. .. 
_issues-status-cxx20: diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -210,7 +210,7 @@ "`3250 `__","``std::format``\ : ``#``\ (alternate form) for NaN and inf","Prague","|Complete|","14.0","|format|" "`3251 `__","Are ``std::format``\ alignment specifiers applied to string arguments?","Prague","|Complete|","14.0","|format|" "`3252 `__","Parse locale's aware modifiers for commands are not consistent with POSIX spec","Prague","","","|chrono|" -"`3254 `__","Strike ``stop_token``\ 's ``operator!=``\ ","Prague","","" +"`3254 `__","Strike ``stop_token``\ 's ``operator!=``\ ","Prague","|Complete|","17.0" "`3255 `__","``span``\ 's ``array``\ constructor is too strict","Prague","|Complete|","" "`3260 `__","``year_month*``\ arithmetic rejects durations convertible to years","Prague","","","|chrono|" "`3262 `__","Formatting of negative durations is not specified","Prague","|Complete|","16.0","|chrono| |format|" diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv --- a/libcxx/docs/Status/Cxx20Papers.csv +++ b/libcxx/docs/Status/Cxx20Papers.csv @@ -104,7 +104,7 @@ "`P0553R4 `__","LWG","Bit operations","Cologne","|Complete|","9.0" "`P0631R8 `__","LWG","Math Constants","Cologne","|Complete|","11.0" "`P0645R10 `__","LWG","Text Formatting","Cologne","|Complete| [#note-P0645]_","14.0" -"`P0660R10 `__","LWG","Stop Token and Joining Thread, Rev 10","Cologne","","" +"`P0660R10 `__","LWG","Stop Token and Joining Thread, Rev 10.","Cologne","|In Progress| [#note-P0660]_","" "`P0784R7 `__","CWG","More constexpr containers","Cologne","|Complete|","12.0" "`P0980R1 `__","LWG","Making std::string constexpr","Cologne","|Complete|","15.0" "`P1004R2 `__","LWG","Making std::vector constexpr","Cologne","|Complete|","15.0" diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt --- a/libcxx/include/CMakeLists.txt +++ 
b/libcxx/include/CMakeLists.txt @@ -632,6 +632,10 @@ __stop_token/atomic_unique_lock.h __stop_token/intrusive_list_view.h __stop_token/intrusive_shared_ptr.h + __stop_token/stop_callback.h + __stop_token/stop_source.h + __stop_token/stop_state.h + __stop_token/stop_token.h __string/char_traits.h __string/constexpr_c_functions.h __string/extern_template_lists.h @@ -947,6 +951,7 @@ stdint.h stdio.h stdlib.h + stop_token streambuf string string.h diff --git a/libcxx/include/__stop_token/stop_callback.h b/libcxx/include/__stop_token/stop_callback.h new file mode 100644 --- /dev/null +++ b/libcxx/include/__stop_token/stop_callback.h @@ -0,0 +1,98 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___STOP_TOKEN_STOP_CALLBACK_H +#define _LIBCPP___STOP_TOKEN_STOP_CALLBACK_H + +#include <__concepts/constructible.h> +#include <__concepts/destructible.h> +#include <__concepts/invocable.h> +#include <__config> +#include <__stop_token/intrusive_shared_ptr.h> +#include <__stop_token/stop_state.h> +#include <__stop_token/stop_token.h> +#include <__type_traits/is_nothrow_constructible.h> +#include <__utility/forward.h> +#include <__utility/move.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 20 + +template +class stop_callback : private __stop_callback_base { + static_assert(invocable<_Callback>, + "Mandates: stop_callback is instantiated with an argument for the template parameter Callback that " + "satisfies invocable."); + static_assert(destructible<_Callback>, + "Mandates: stop_callback is instantiated with an argument for 
the template parameter Callback that " + "satisfies destructible."); + +public: + using callback_type = _Callback; + + template + requires constructible_from<_Callback, _Cb> + _LIBCPP_HIDE_FROM_ABI explicit stop_callback(const stop_token& __st, + _Cb&& __cb) noexcept(is_nothrow_constructible_v<_Callback, _Cb>) + : stop_callback(__private_tag{}, __st.__state_, std::forward<_Cb>(__cb)) {} + + template + requires constructible_from<_Callback, _Cb> + _LIBCPP_HIDE_FROM_ABI explicit stop_callback(stop_token&& __st, + _Cb&& __cb) noexcept(is_nothrow_constructible_v<_Callback, _Cb>) + : stop_callback(__private_tag{}, std::move(__st.__state_), std::forward<_Cb>(__cb)) {} + + _LIBCPP_HIDE_FROM_ABI ~stop_callback() { + if (__state_) { + __state_->__remove_callback(this); + } + } + + stop_callback(const stop_callback&) = delete; + stop_callback(stop_callback&&) = delete; + stop_callback& operator=(const stop_callback&) = delete; + stop_callback& operator=(stop_callback&&) = delete; + +private: + _LIBCPP_NO_UNIQUE_ADDRESS _Callback __callback_; + __intrusive_shared_ptr<__stop_state> __state_; + + friend __stop_callback_base; + + struct __private_tag {}; + + template + _LIBCPP_HIDE_FROM_ABI explicit stop_callback(__private_tag, _StatePtr&& __state, _Cb&& __cb) noexcept( + is_nothrow_constructible_v<_Callback, _Cb>) + : __stop_callback_base(+[](__stop_callback_base* __cb_base) noexcept { + // stop callback is supposed to only be called once + std::forward<_Callback>(static_cast(__cb_base)->__callback_)(); + }), + __callback_(std::forward<_Cb>(__cb)), + __state_() { + if (__state && __state->__add_callback(this)) { + // st.stop_requested() was false and this is successfully added to the linked list + __state_ = std::forward<_StatePtr>(__state); + } + } +}; + +template +stop_callback(stop_token, Callback) -> stop_callback; + +#endif // _LIBCPP_STD_VER >= 20 + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___STOP_TOKEN_STOP_CALLBACK_H diff --git 
a/libcxx/include/__stop_token/stop_source.h b/libcxx/include/__stop_token/stop_source.h new file mode 100644 --- /dev/null +++ b/libcxx/include/__stop_token/stop_source.h @@ -0,0 +1,91 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___STOP_TOKEN_STOP_SOURCE_H +#define _LIBCPP___STOP_TOKEN_STOP_SOURCE_H + +#include <__config> +#include <__stop_token/intrusive_shared_ptr.h> +#include <__stop_token/stop_state.h> +#include <__stop_token/stop_token.h> +#include <__utility/move.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 20 + +struct nostopstate_t { + explicit nostopstate_t() = default; +}; + +inline constexpr nostopstate_t nostopstate{}; + +class stop_source { +public: + _LIBCPP_HIDE_FROM_ABI stop_source() : __state_(new __stop_state()) { __state_->__increment_stop_source_counter(); } + + _LIBCPP_HIDE_FROM_ABI explicit stop_source(nostopstate_t) noexcept : __state_(nullptr) {} + + _LIBCPP_HIDE_FROM_ABI stop_source(const stop_source& __other) noexcept : __state_(__other.__state_) { + if (__state_) { + __state_->__increment_stop_source_counter(); + } + } + + _LIBCPP_HIDE_FROM_ABI stop_source(stop_source&& __other) noexcept = default; + + _LIBCPP_HIDE_FROM_ABI stop_source& operator=(const stop_source& __other) noexcept { + // increment `__other` first so that we don't hit 0 in case of self-assignment + if (__other.__state_) { + __other.__state_->__increment_stop_source_counter(); + } + if (__state_) { + __state_->__decrement_stop_source_counter(); + } + __state_ = __other.__state_; + return *this; + } + + 
_LIBCPP_HIDE_FROM_ABI stop_source& operator=(stop_source&&) noexcept = default; + + _LIBCPP_HIDE_FROM_ABI ~stop_source() { + if (__state_) { + __state_->__decrement_stop_source_counter(); + } + } + + _LIBCPP_HIDE_FROM_ABI void swap(stop_source& __other) noexcept { __state_.swap(__other.__state_); } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI stop_token get_token() const noexcept { return stop_token(__state_); } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool stop_possible() const noexcept { return __state_ != nullptr; } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool stop_requested() const noexcept { + return __state_ != nullptr && __state_->__stop_requested(); + } + + _LIBCPP_HIDE_FROM_ABI bool request_stop() noexcept { return __state_ && __state_->__request_stop(); } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI friend bool operator==(const stop_source&, const stop_source&) noexcept = default; + + _LIBCPP_HIDE_FROM_ABI friend void swap(stop_source& __lhs, stop_source& __rhs) noexcept { __lhs.swap(__rhs); } + +private: + __intrusive_shared_ptr<__stop_state> __state_; +}; + +#endif // _LIBCPP_STD_VER >= 20 + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___STOP_TOKEN_STOP_SOURCE_H diff --git a/libcxx/include/__stop_token/stop_state.h b/libcxx/include/__stop_token/stop_state.h new file mode 100644 --- /dev/null +++ b/libcxx/include/__stop_token/stop_state.h @@ -0,0 +1,233 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___STOP_TOKEN_STOP_STATE_H +#define _LIBCPP___STOP_TOKEN_STOP_STATE_H + +#include <__config> +#include <__stop_token/atomic_unique_lock.h> +#include <__stop_token/intrusive_list_view.h> +#include +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 20 + +struct __stop_callback_base : __intrusive_node_base<__stop_callback_base> { + using __callback_fn_t = void(__stop_callback_base*) noexcept; + _LIBCPP_HIDE_FROM_ABI explicit __stop_callback_base(__callback_fn_t* __callback_fn) : __callback_fn_(__callback_fn) {} + + _LIBCPP_HIDE_FROM_ABI void __invoke() noexcept { __callback_fn_(this); } + + __callback_fn_t* __callback_fn_; + std::atomic __completed_ = false; + bool* __destroyed_ = nullptr; +}; + +class __stop_state { + static constexpr uint32_t __stop_requested_bit = 1; + static constexpr uint32_t __callback_list_locked_bit = 1 << 1; + static constexpr uint32_t __stop_source_counter_shift = 2; + + // The "stop_source counter" is not used for lifetime reference counting. + // When the number of stop_source reaches 0, the remaining stop_tokens's + // stop_possible will return false. We need this counter to track this. 
+ // + // The "callback list locked" bit implements the atomic_unique_lock to + // guard the operations on the callback list + // + // 31 - 2 | 1 | 0 | + // stop_source counter | callback list locked | stop_requested | + std::atomic __state_ = 0; + + // Reference count for stop_token + stop_callback + stop_source + // When the counter reaches zero, the state is destroyed + // It is used by __intrusive_shared_ptr, but it is stored here for better layout + std::atomic __ref_count_ = 0; + + using __state_t = uint32_t; + using __callback_list_lock = __atomic_unique_lock<__state_t, __callback_list_locked_bit>; + using __callback_list = __intrusive_list_view<__stop_callback_base>; + + __callback_list __callback_list_; + std::thread::id __requesting_thread_; + +public: + _LIBCPP_HIDE_FROM_ABI __stop_state() noexcept = default; + + _LIBCPP_HIDE_FROM_ABI void __increment_stop_source_counter() noexcept { + _LIBCPP_ASSERT( + __state_.load(std::memory_order_relaxed) <= static_cast<__state_t>(~(1 << __stop_source_counter_shift)), + "stop_source's counter reaches the maximum. Incrementing the counter will overflow"); + __state_.fetch_add(1 << __stop_source_counter_shift, std::memory_order_relaxed); + } + + // We are not destroying the object after counter decrements to zero, nor do we have + // operations depend on the ordering of decrementing the counter. relaxed is enough. + _LIBCPP_HIDE_FROM_ABI void __decrement_stop_source_counter() noexcept { + _LIBCPP_ASSERT(__state_.load(std::memory_order_relaxed) >= static_cast<__state_t>(1 << __stop_source_counter_shift), + "stop_source's counter is 0. 
Decrementing the counter will underflow"); + __state_.fetch_sub(1 << __stop_source_counter_shift, std::memory_order_relaxed); + } + + _LIBCPP_HIDE_FROM_ABI bool __stop_requested() const noexcept { + // acquire because [thread.stoptoken.intro] A call to request_stop that returns true + // synchronizes with a call to stop_requested on an associated stop_token or stop_source + // object that returns true. + // request_stop's compare_exchange_weak has release which syncs with this acquire + return (__state_.load(std::memory_order_acquire) & __stop_requested_bit) != 0; + } + + _LIBCPP_HIDE_FROM_ABI bool __stop_possible_for_stop_token() const noexcept { + // [stoptoken.mem] false if "a stop request was not made and there are no associated stop_source objects" + // Todo: Can this be std::memory_order_relaxed as the standard does not say anything except not to introduce data + // race? + __state_t __curent_state = __state_.load(std::memory_order_acquire); + return ((__curent_state & __stop_requested_bit) != 0) || ((__curent_state >> __stop_source_counter_shift) != 0); + } + + _LIBCPP_HIDE_FROM_ABI bool __request_stop() noexcept { + auto __cb_list_lock = __try_lock_for_request_stop(); + if (!__cb_list_lock.__owns_lock()) { + return false; + } + __requesting_thread_ = std::this_thread::get_id(); + + while (!__callback_list_.__empty()) { + auto __cb = __callback_list_.__pop_front(); + + // allow other callbacks to be removed while invoking the current callback + __cb_list_lock.__unlock(); + + bool __destroyed = false; + __cb->__destroyed_ = &__destroyed; + + __cb->__invoke(); + + // __cb's invoke function could potentially delete itself. 
We need to check before accessing __cb's member + if (!__destroyed) { + // needs to set __destroyed_ pointer to nullptr, otherwise it points to a local variable + // which is to be destroyed at the end of the loop + __cb->__destroyed_ = nullptr; + + // [stopcallback.cons] If callback is concurrently executing on another thread, then the return + // from the invocation of callback strongly happens before ([intro.races]) callback is destroyed. + // this release syncs with the acquire in the remove_callback + __cb->__completed_.store(true, std::memory_order_release); + __cb->__completed_.notify_all(); + } + + __cb_list_lock.__lock(); + } + + return true; + } + + _LIBCPP_HIDE_FROM_ABI bool __add_callback(__stop_callback_base* __cb) noexcept { + // If it is already stop_requested. Do not try to request it again. + const auto __give_up_trying_to_lock_condition = [__cb](__state_t __state) { + if ((__state & __stop_requested_bit) != 0) { + // already stop requested, synchronously run the callback and no need to lock the list again + __cb->__invoke(); + return true; + } + // no stop source. no need to lock the list to add the callback as it can never be invoked + return (__state >> __stop_source_counter_shift) == 0; + }; + + __callback_list_lock __cb_list_lock(__state_, __give_up_trying_to_lock_condition); + + if (!__cb_list_lock.__owns_lock()) { + return false; + } + + __callback_list_.__push_front(__cb); + + return true; + // unlock here: [thread.stoptoken.intro] Registration of a callback synchronizes with the invocation of + // that callback. 
+ // Note: this release sync with the acquire in the request_stop' __try_lock_for_request_stop + } + + // called by the destructor of stop_callback + _LIBCPP_HIDE_FROM_ABI void __remove_callback(__stop_callback_base* __cb) noexcept { + __callback_list_lock __cb_list_lock(__state_); + + // under below condition, the request_stop call just popped __cb from the list and could execute it now + bool __potentially_executing_now = __cb->__prev_ == nullptr && !__callback_list_.__is_head(__cb); + + if (__potentially_executing_now) { + auto __requested_thread = __requesting_thread_; + __cb_list_lock.__unlock(); + + if (std::this_thread::get_id() != __requested_thread) { + // [stopcallback.cons] If callback is concurrently executing on another thread, then the return + // from the invocation of callback strongly happens before ([intro.races]) callback is destroyed. + __cb->__completed_.wait(false, std::memory_order_acquire); + } else { + // The destructor of stop_callback runs on the same thread of the thread that invokes the callback. + // The callback is potentially invoking its own destuctor. Set the flag to avoid accessing destroyed + // members on the invoking side + if (__cb->__destroyed_) { + *__cb->__destroyed_ = true; + } + } + } else { + __callback_list_.__remove(__cb); + } + } + +private: + _LIBCPP_HIDE_FROM_ABI __callback_list_lock __try_lock_for_request_stop() noexcept { + // If it is already stop_requested, do not try to request stop or lock the list again. + const auto __lock_fail_condition = [](__state_t __state) { return (__state & __stop_requested_bit) != 0; }; + + // set locked and requested bit at the same time + const auto __after_lock_state = [](__state_t __state) { + return __state | __callback_list_locked_bit | __stop_requested_bit; + }; + + // acq because [thread.stoptoken.intro] Registration of a callback synchronizes with the invocation of that + // callback. 
We are going to invoke the callback after getting the lock, acquire so that we can see the + // registration of a callback (and other writes that happens-before the add_callback) + // Note: the rel (unlock) in the add_callback syncs with this acq + // rel because [thread.stoptoken.intro] A call to request_stop that returns true synchronizes with a call + // to stop_requested on an associated stop_token or stop_source object that returns true. + // We need to make sure that all writes (including user code) before request_stop will be made visible + // to the threads that waiting for `stop_requested == true` + // Note: this rel syncs with the acq in `stop_requested` + const auto __locked_ordering = std::memory_order_acq_rel; + + return __callback_list_lock(__state_, __lock_fail_condition, __after_lock_state, __locked_ordering); + } + + template + friend struct __intrusive_shared_ptr_traits; +}; + +template +struct __intrusive_shared_ptr_traits; + +template <> +struct __intrusive_shared_ptr_traits<__stop_state> { + _LIBCPP_HIDE_FROM_ABI static std::atomic& __get_atomic_ref_count(__stop_state& __state) { + return __state.__ref_count_; + } +}; + +#endif // _LIBCPP_STD_VER >= 20 + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___STOP_TOKEN_STOP_STATE_H diff --git a/libcxx/include/__stop_token/stop_token.h b/libcxx/include/__stop_token/stop_token.h new file mode 100644 --- /dev/null +++ b/libcxx/include/__stop_token/stop_token.h @@ -0,0 +1,63 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___STOP_TOKEN_STOP_TOKEN_H +#define _LIBCPP___STOP_TOKEN_STOP_TOKEN_H + +#include <__config> +#include <__stop_token/intrusive_shared_ptr.h> +#include <__stop_token/stop_state.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 20 + +class stop_token { +public: + _LIBCPP_HIDE_FROM_ABI stop_token() noexcept = default; + + _LIBCPP_HIDE_FROM_ABI stop_token(const stop_token&) noexcept = default; + _LIBCPP_HIDE_FROM_ABI stop_token(stop_token&&) noexcept = default; + _LIBCPP_HIDE_FROM_ABI stop_token& operator=(const stop_token&) noexcept = default; + _LIBCPP_HIDE_FROM_ABI stop_token& operator=(stop_token&&) noexcept = default; + _LIBCPP_HIDE_FROM_ABI ~stop_token() = default; + + _LIBCPP_HIDE_FROM_ABI void swap(stop_token& __other) noexcept { __state_.swap(__other.__state_); } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool stop_requested() const noexcept { + return __state_ != nullptr && __state_->__stop_requested(); + } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool stop_possible() const noexcept { + return __state_ != nullptr && __state_->__stop_possible_for_stop_token(); + } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI friend bool operator==(const stop_token&, const stop_token&) noexcept = default; + + _LIBCPP_HIDE_FROM_ABI friend void swap(stop_token& __lhs, stop_token& __rhs) noexcept { __lhs.swap(__rhs); } + +private: + __intrusive_shared_ptr<__stop_state> __state_; + + friend class stop_source; + template + friend class stop_callback; + + _LIBCPP_HIDE_FROM_ABI explicit stop_token(const __intrusive_shared_ptr<__stop_state>& __state) : __state_(__state) {} +}; + +#endif // _LIBCPP_STD_VER >= 20 + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___STOP_TOKEN_STOP_TOKEN_H diff --git a/libcxx/include/module.modulemap.in 
b/libcxx/include/module.modulemap.in --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -1448,11 +1448,17 @@ export * } module stop_token { + @requires_LIBCXX_ENABLE_THREADS@ + header "stop_token" export * module __stop_token { module atomic_unique_lock { private header "__stop_token/atomic_unique_lock.h" } module intrusive_list_view { private header "__stop_token/intrusive_list_view.h" } module intrusive_shared_ptr { private header "__stop_token/intrusive_shared_ptr.h" } + module stop_callback { private header "__stop_token/stop_callback.h" } + module stop_source { private header "__stop_token/stop_source.h" } + module stop_state { private header "__stop_token/stop_state.h" } + module stop_token { private header "__stop_token/stop_token.h" } } } module streambuf { diff --git a/libcxx/include/stop_token b/libcxx/include/stop_token new file mode 100644 --- /dev/null +++ b/libcxx/include/stop_token @@ -0,0 +1,49 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP_STOP_TOKEN +#define _LIBCPP_STOP_TOKEN + +/* + +namespace std { + // [stoptoken], class stop_token + class stop_token; + + // [stopsource], class stop_source + class stop_source; + + // no-shared-stop-state indicator + struct nostopstate_t { + explicit nostopstate_t() = default; + }; + inline constexpr nostopstate_t nostopstate{}; + + // [stopcallback], class template stop_callback + template + class stop_callback; + +*/ + +#include <__assert> // all public C++ headers provide the assertion handler +#include <__config> +#include <__stop_token/stop_callback.h> +#include <__stop_token/stop_source.h> +#include <__stop_token/stop_token.h> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#ifdef _LIBCPP_HAS_NO_THREADS +# error " is not supported since libc++ has been configured without support for threads." 
+#endif + +#endif // _LIBCPP_STOP_TOKEN diff --git a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.sh.cpp b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.sh.cpp --- a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.sh.cpp +++ b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.sh.cpp @@ -555,199 +555,205 @@ #endif // RUN: %{build} -DTEST_101 -#if defined(TEST_101) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +#if defined(TEST_101) && !defined(_LIBCPP_HAS_NO_THREADS) && defined(_LIBCPP_AVAILABILITY_SYNC) +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_102 -#if defined(TEST_102) -# include +#if defined(TEST_102) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif -// RUN: %{build} -DTEST_104 -#if defined(TEST_104) -# include +// RUN: %{build} -DTEST_103 +#if defined(TEST_103) +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_105 -#if defined(TEST_105) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +#if defined(TEST_105) +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_106 -#if defined(TEST_106) -# include +#if defined(TEST_106) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif -// RUN: %{build} -DTEST_108 -#if defined(TEST_108) && !defined(_LIBCPP_HAS_NO_THREADS) -# include +// RUN: %{build} -DTEST_107 +#if defined(TEST_107) +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_109 -#if defined(TEST_109) -# include +#if defined(TEST_109) && !defined(_LIBCPP_HAS_NO_THREADS) +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_110 #if defined(TEST_110) -# include +# include using HandlerType = 
decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_111 #if defined(TEST_111) -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_112 #if defined(TEST_112) -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif -// RUN: %{build} -DTEST_114 -#if defined(TEST_114) -# include +// RUN: %{build} -DTEST_113 +#if defined(TEST_113) +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_115 #if defined(TEST_115) -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_116 #if defined(TEST_116) -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_117 #if defined(TEST_117) -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_118 #if defined(TEST_118) -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_119 #if defined(TEST_119) -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_120 #if defined(TEST_120) -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif -// RUN: %{build} -DTEST_123 -#if defined(TEST_123) && __cplusplus >= 201103L -# include +// RUN: %{build} -DTEST_121 +#if defined(TEST_121) +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_124 #if defined(TEST_124) && __cplusplus >= 201103L -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_125 #if defined(TEST_125) && __cplusplus >= 201103L -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_126 #if defined(TEST_126) && __cplusplus >= 201103L -# include +# include using 
HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_127 #if defined(TEST_127) && __cplusplus >= 201103L -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_128 #if defined(TEST_128) && __cplusplus >= 201103L -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_129 #if defined(TEST_129) && __cplusplus >= 201103L -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_130 -#if defined(TEST_130) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) && __cplusplus >= 201103L -# include +#if defined(TEST_130) && __cplusplus >= 201103L +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_131 -#if defined(TEST_131) && __cplusplus >= 201103L -# include +#if defined(TEST_131) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) && __cplusplus >= 201103L +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_132 #if defined(TEST_132) && __cplusplus >= 201103L -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_133 #if defined(TEST_133) && __cplusplus >= 201103L -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_134 #if defined(TEST_134) && __cplusplus >= 201103L -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_135 #if defined(TEST_135) && __cplusplus >= 201103L -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_136 #if defined(TEST_136) && __cplusplus >= 201103L -# include +# include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_137 #if defined(TEST_137) && __cplusplus >= 201103L -# include +# include using HandlerType = 
decltype(std::__libcpp_verbose_abort); #endif // RUN: %{build} -DTEST_138 #if defined(TEST_138) && __cplusplus >= 201103L +# include + using HandlerType = decltype(std::__libcpp_verbose_abort); +#endif + +// RUN: %{build} -DTEST_139 +#if defined(TEST_139) && __cplusplus >= 201103L # include using HandlerType = decltype(std::__libcpp_verbose_abort); #endif diff --git a/libcxx/test/libcxx/clang_tidy.sh.cpp b/libcxx/test/libcxx/clang_tidy.sh.cpp --- a/libcxx/test/libcxx/clang_tidy.sh.cpp +++ b/libcxx/test/libcxx/clang_tidy.sh.cpp @@ -178,6 +178,9 @@ #include #include #include +#if !defined(_LIBCPP_HAS_NO_THREADS) && defined(_LIBCPP_AVAILABILITY_SYNC) +# include +#endif #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) # include #endif diff --git a/libcxx/test/libcxx/double_include.sh.cpp b/libcxx/test/libcxx/double_include.sh.cpp --- a/libcxx/test/libcxx/double_include.sh.cpp +++ b/libcxx/test/libcxx/double_include.sh.cpp @@ -176,6 +176,9 @@ #include #include #include +#if !defined(_LIBCPP_HAS_NO_THREADS) && defined(_LIBCPP_AVAILABILITY_SYNC) +# include +#endif #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) # include #endif diff --git a/libcxx/test/libcxx/min_max_macros.compile.pass.cpp b/libcxx/test/libcxx/min_max_macros.compile.pass.cpp --- a/libcxx/test/libcxx/min_max_macros.compile.pass.cpp +++ b/libcxx/test/libcxx/min_max_macros.compile.pass.cpp @@ -277,6 +277,10 @@ TEST_MACROS(); #include TEST_MACROS(); +#if !defined(_LIBCPP_HAS_NO_THREADS) && defined(_LIBCPP_AVAILABILITY_SYNC) +# include +TEST_MACROS(); +#endif #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) # include TEST_MACROS(); diff --git a/libcxx/test/libcxx/modules_include.sh.cpp b/libcxx/test/libcxx/modules_include.sh.cpp --- a/libcxx/test/libcxx/modules_include.sh.cpp +++ b/libcxx/test/libcxx/modules_include.sh.cpp @@ -644,232 +644,237 @@ // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_101 &' >> %t.sh // RUN: echo 'TEST_101=$!' 
>> %t.sh // RUN: echo "wait $TEST_85" >> %t.sh -#if defined(TEST_101) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -#include +#if defined(TEST_101) && !defined(_LIBCPP_HAS_NO_THREADS) && defined(_LIBCPP_AVAILABILITY_SYNC) +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_102 &' >> %t.sh // RUN: echo 'TEST_102=$!' >> %t.sh // RUN: echo "wait $TEST_86" >> %t.sh -#if defined(TEST_102) -#include +#if defined(TEST_102) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_103 &' >> %t.sh // RUN: echo 'TEST_103=$!' >> %t.sh // RUN: echo "wait $TEST_87" >> %t.sh #if defined(TEST_103) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_104 &' >> %t.sh // RUN: echo 'TEST_104=$!' >> %t.sh // RUN: echo "wait $TEST_88" >> %t.sh #if defined(TEST_104) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_105 &' >> %t.sh // RUN: echo 'TEST_105=$!' >> %t.sh // RUN: echo "wait $TEST_89" >> %t.sh -#if defined(TEST_105) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -#include +#if defined(TEST_105) +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_106 &' >> %t.sh // RUN: echo 'TEST_106=$!' >> %t.sh // RUN: echo "wait $TEST_90" >> %t.sh -#if defined(TEST_106) -#include +#if defined(TEST_106) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_107 &' >> %t.sh // RUN: echo 'TEST_107=$!' 
>> %t.sh // RUN: echo "wait $TEST_91" >> %t.sh #if defined(TEST_107) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_108 &' >> %t.sh // RUN: echo 'TEST_108=$!' >> %t.sh // RUN: echo "wait $TEST_92" >> %t.sh -#if defined(TEST_108) && !defined(_LIBCPP_HAS_NO_THREADS) -#include +#if defined(TEST_108) +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_109 &' >> %t.sh // RUN: echo 'TEST_109=$!' >> %t.sh // RUN: echo "wait $TEST_93" >> %t.sh -#if defined(TEST_109) -#include +#if defined(TEST_109) && !defined(_LIBCPP_HAS_NO_THREADS) +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_110 &' >> %t.sh // RUN: echo 'TEST_110=$!' >> %t.sh // RUN: echo "wait $TEST_94" >> %t.sh #if defined(TEST_110) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_111 &' >> %t.sh // RUN: echo 'TEST_111=$!' >> %t.sh // RUN: echo "wait $TEST_95" >> %t.sh #if defined(TEST_111) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_112 &' >> %t.sh // RUN: echo 'TEST_112=$!' >> %t.sh // RUN: echo "wait $TEST_96" >> %t.sh #if defined(TEST_112) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_113 &' >> %t.sh // RUN: echo 'TEST_113=$!' >> %t.sh // RUN: echo "wait $TEST_97" >> %t.sh #if defined(TEST_113) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_114 &' >> %t.sh // RUN: echo 'TEST_114=$!' 
>> %t.sh // RUN: echo "wait $TEST_98" >> %t.sh #if defined(TEST_114) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_115 &' >> %t.sh // RUN: echo 'TEST_115=$!' >> %t.sh // RUN: echo "wait $TEST_99" >> %t.sh #if defined(TEST_115) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_116 &' >> %t.sh // RUN: echo 'TEST_116=$!' >> %t.sh // RUN: echo "wait $TEST_100" >> %t.sh #if defined(TEST_116) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_117 &' >> %t.sh // RUN: echo 'TEST_117=$!' >> %t.sh // RUN: echo "wait $TEST_101" >> %t.sh #if defined(TEST_117) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_118 &' >> %t.sh // RUN: echo 'TEST_118=$!' >> %t.sh // RUN: echo "wait $TEST_102" >> %t.sh #if defined(TEST_118) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_119 &' >> %t.sh // RUN: echo 'TEST_119=$!' >> %t.sh // RUN: echo "wait $TEST_103" >> %t.sh #if defined(TEST_119) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_120 &' >> %t.sh // RUN: echo 'TEST_120=$!' >> %t.sh // RUN: echo "wait $TEST_104" >> %t.sh #if defined(TEST_120) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_121 &' >> %t.sh // RUN: echo 'TEST_121=$!' 
>> %t.sh // RUN: echo "wait $TEST_105" >> %t.sh -#if defined(TEST_121) && !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) -#include +#if defined(TEST_121) +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_122 &' >> %t.sh // RUN: echo 'TEST_122=$!' >> %t.sh // RUN: echo "wait $TEST_106" >> %t.sh #if defined(TEST_122) && !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_123 &' >> %t.sh // RUN: echo 'TEST_123=$!' >> %t.sh // RUN: echo "wait $TEST_107" >> %t.sh -#if defined(TEST_123) && __cplusplus >= 201103L -#include +#if defined(TEST_123) && !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_124 &' >> %t.sh // RUN: echo 'TEST_124=$!' >> %t.sh // RUN: echo "wait $TEST_108" >> %t.sh #if defined(TEST_124) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_125 &' >> %t.sh // RUN: echo 'TEST_125=$!' >> %t.sh // RUN: echo "wait $TEST_109" >> %t.sh #if defined(TEST_125) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_126 &' >> %t.sh // RUN: echo 'TEST_126=$!' >> %t.sh // RUN: echo "wait $TEST_110" >> %t.sh #if defined(TEST_126) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_127 &' >> %t.sh // RUN: echo 'TEST_127=$!' 
>> %t.sh // RUN: echo "wait $TEST_111" >> %t.sh #if defined(TEST_127) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_128 &' >> %t.sh // RUN: echo 'TEST_128=$!' >> %t.sh // RUN: echo "wait $TEST_112" >> %t.sh #if defined(TEST_128) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_129 &' >> %t.sh // RUN: echo 'TEST_129=$!' >> %t.sh // RUN: echo "wait $TEST_113" >> %t.sh #if defined(TEST_129) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_130 &' >> %t.sh // RUN: echo 'TEST_130=$!' >> %t.sh // RUN: echo "wait $TEST_114" >> %t.sh -#if defined(TEST_130) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) && __cplusplus >= 201103L -#include +#if defined(TEST_130) && __cplusplus >= 201103L +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_131 &' >> %t.sh // RUN: echo 'TEST_131=$!' >> %t.sh // RUN: echo "wait $TEST_115" >> %t.sh -#if defined(TEST_131) && __cplusplus >= 201103L -#include +#if defined(TEST_131) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) && __cplusplus >= 201103L +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_132 &' >> %t.sh // RUN: echo 'TEST_132=$!' >> %t.sh // RUN: echo "wait $TEST_116" >> %t.sh #if defined(TEST_132) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_133 &' >> %t.sh // RUN: echo 'TEST_133=$!' 
>> %t.sh // RUN: echo "wait $TEST_117" >> %t.sh #if defined(TEST_133) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_134 &' >> %t.sh // RUN: echo 'TEST_134=$!' >> %t.sh // RUN: echo "wait $TEST_118" >> %t.sh #if defined(TEST_134) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_135 &' >> %t.sh // RUN: echo 'TEST_135=$!' >> %t.sh // RUN: echo "wait $TEST_119" >> %t.sh #if defined(TEST_135) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_136 &' >> %t.sh // RUN: echo 'TEST_136=$!' >> %t.sh // RUN: echo "wait $TEST_120" >> %t.sh #if defined(TEST_136) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_137 &' >> %t.sh // RUN: echo 'TEST_137=$!' >> %t.sh // RUN: echo "wait $TEST_121" >> %t.sh #if defined(TEST_137) && __cplusplus >= 201103L -#include +#include #endif // RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_138 &' >> %t.sh // RUN: echo 'TEST_138=$!' >> %t.sh // RUN: echo "wait $TEST_122" >> %t.sh #if defined(TEST_138) && __cplusplus >= 201103L -#include +#include #endif +// RUN: echo '%{cxx} %s %{flags} %{compile_flags} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only -DTEST_139 &' >> %t.sh +// RUN: echo 'TEST_139=$!' 
>> %t.sh // RUN: echo "wait $TEST_123" >> %t.sh +#if defined(TEST_139) && __cplusplus >= 201103L +#include +#endif // RUN: echo "wait $TEST_124" >> %t.sh // RUN: echo "wait $TEST_125" >> %t.sh // RUN: echo "wait $TEST_126" >> %t.sh @@ -885,5 +890,6 @@ // RUN: echo "wait $TEST_136" >> %t.sh // RUN: echo "wait $TEST_137" >> %t.sh // RUN: echo "wait $TEST_138" >> %t.sh +// RUN: echo "wait $TEST_139" >> %t.sh // RUN: bash %t.sh // GENERATED-MARKER diff --git a/libcxx/test/libcxx/nasty_macros.compile.pass.cpp b/libcxx/test/libcxx/nasty_macros.compile.pass.cpp --- a/libcxx/test/libcxx/nasty_macros.compile.pass.cpp +++ b/libcxx/test/libcxx/nasty_macros.compile.pass.cpp @@ -301,6 +301,9 @@ #include #include #include +#if !defined(_LIBCPP_HAS_NO_THREADS) && defined(_LIBCPP_AVAILABILITY_SYNC) +# include +#endif #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) # include #endif diff --git a/libcxx/test/libcxx/no_assert_include.compile.pass.cpp b/libcxx/test/libcxx/no_assert_include.compile.pass.cpp --- a/libcxx/test/libcxx/no_assert_include.compile.pass.cpp +++ b/libcxx/test/libcxx/no_assert_include.compile.pass.cpp @@ -173,6 +173,9 @@ #include #include #include +#if !defined(_LIBCPP_HAS_NO_THREADS) && defined(_LIBCPP_AVAILABILITY_SYNC) +# include +#endif #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) # include #endif diff --git a/libcxx/test/libcxx/private_headers.verify.cpp b/libcxx/test/libcxx/private_headers.verify.cpp --- a/libcxx/test/libcxx/private_headers.verify.cpp +++ b/libcxx/test/libcxx/private_headers.verify.cpp @@ -629,6 +629,10 @@ #include <__stop_token/atomic_unique_lock.h> // expected-error@*:* {{use of private header from outside its module: '__stop_token/atomic_unique_lock.h'}} #include <__stop_token/intrusive_list_view.h> // expected-error@*:* {{use of private header from outside its module: '__stop_token/intrusive_list_view.h'}} #include <__stop_token/intrusive_shared_ptr.h> // expected-error@*:* {{use of private header from outside its module: 
'__stop_token/intrusive_shared_ptr.h'}} +#include <__stop_token/stop_callback.h> // expected-error@*:* {{use of private header from outside its module: '__stop_token/stop_callback.h'}} +#include <__stop_token/stop_source.h> // expected-error@*:* {{use of private header from outside its module: '__stop_token/stop_source.h'}} +#include <__stop_token/stop_state.h> // expected-error@*:* {{use of private header from outside its module: '__stop_token/stop_state.h'}} +#include <__stop_token/stop_token.h> // expected-error@*:* {{use of private header from outside its module: '__stop_token/stop_token.h'}} #include <__string/char_traits.h> // expected-error@*:* {{use of private header from outside its module: '__string/char_traits.h'}} #include <__string/constexpr_c_functions.h> // expected-error@*:* {{use of private header from outside its module: '__string/constexpr_c_functions.h'}} #include <__string/extern_template_lists.h> // expected-error@*:* {{use of private header from outside its module: '__string/extern_template_lists.h'}} diff --git a/libcxx/test/libcxx/transitive_includes.sh.cpp b/libcxx/test/libcxx/transitive_includes.sh.cpp --- a/libcxx/test/libcxx/transitive_includes.sh.cpp +++ b/libcxx/test/libcxx/transitive_includes.sh.cpp @@ -409,136 +409,140 @@ #if defined(TEST_97) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_101 > /dev/null 2> %t/header.streambuf +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_101 > /dev/null 2> %t/header.stop_token #if defined(TEST_101) -#include +#include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_102 > /dev/null 2> %t/header.string +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_102 > /dev/null 2> %t/header.streambuf #if defined(TEST_102) +#include +#endif +// RUN: %{cxx} %s 
%{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_103 > /dev/null 2> %t/header.string +#if defined(TEST_103) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_104 > /dev/null 2> %t/header.string_view -#if defined(TEST_104) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_105 > /dev/null 2> %t/header.string_view +#if defined(TEST_105) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_105 > /dev/null 2> %t/header.strstream -#if defined(TEST_105) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_106 > /dev/null 2> %t/header.strstream +#if defined(TEST_106) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_106 > /dev/null 2> %t/header.system_error -#if defined(TEST_106) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_107 > /dev/null 2> %t/header.system_error +#if defined(TEST_107) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_108 > /dev/null 2> %t/header.thread -#if defined(TEST_108) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_109 > /dev/null 2> %t/header.thread +#if defined(TEST_109) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_109 > /dev/null 2> %t/header.tuple -#if defined(TEST_109) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_110 > /dev/null 2> %t/header.tuple +#if defined(TEST_110) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes 
-fshow-skipped-includes --preprocess -DTEST_110 > /dev/null 2> %t/header.type_traits -#if defined(TEST_110) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_111 > /dev/null 2> %t/header.type_traits +#if defined(TEST_111) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_111 > /dev/null 2> %t/header.typeindex -#if defined(TEST_111) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_112 > /dev/null 2> %t/header.typeindex +#if defined(TEST_112) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_112 > /dev/null 2> %t/header.typeinfo -#if defined(TEST_112) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_113 > /dev/null 2> %t/header.typeinfo +#if defined(TEST_113) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_114 > /dev/null 2> %t/header.unordered_map -#if defined(TEST_114) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_115 > /dev/null 2> %t/header.unordered_map +#if defined(TEST_115) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_115 > /dev/null 2> %t/header.unordered_set -#if defined(TEST_115) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_116 > /dev/null 2> %t/header.unordered_set +#if defined(TEST_116) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_116 > /dev/null 2> %t/header.utility -#if defined(TEST_116) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_117 > 
/dev/null 2> %t/header.utility +#if defined(TEST_117) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_117 > /dev/null 2> %t/header.valarray -#if defined(TEST_117) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_118 > /dev/null 2> %t/header.valarray +#if defined(TEST_118) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_118 > /dev/null 2> %t/header.variant -#if defined(TEST_118) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_119 > /dev/null 2> %t/header.variant +#if defined(TEST_119) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_119 > /dev/null 2> %t/header.vector -#if defined(TEST_119) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_120 > /dev/null 2> %t/header.vector +#if defined(TEST_120) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_120 > /dev/null 2> %t/header.version -#if defined(TEST_120) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_121 > /dev/null 2> %t/header.version +#if defined(TEST_121) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_123 > /dev/null 2> %t/header.experimental_deque -#if defined(TEST_123) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_124 > /dev/null 2> %t/header.experimental_deque +#if defined(TEST_124) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_124 > /dev/null 2> %t/header.experimental_forward_list -#if 
defined(TEST_124) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_125 > /dev/null 2> %t/header.experimental_forward_list +#if defined(TEST_125) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_125 > /dev/null 2> %t/header.experimental_iterator -#if defined(TEST_125) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_126 > /dev/null 2> %t/header.experimental_iterator +#if defined(TEST_126) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_126 > /dev/null 2> %t/header.experimental_list -#if defined(TEST_126) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_127 > /dev/null 2> %t/header.experimental_list +#if defined(TEST_127) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_127 > /dev/null 2> %t/header.experimental_map -#if defined(TEST_127) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_128 > /dev/null 2> %t/header.experimental_map +#if defined(TEST_128) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_128 > /dev/null 2> %t/header.experimental_memory_resource -#if defined(TEST_128) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_129 > /dev/null 2> %t/header.experimental_memory_resource +#if defined(TEST_129) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_129 > /dev/null 2> %t/header.experimental_propagate_const -#if defined(TEST_129) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes 
--preprocess -DTEST_130 > /dev/null 2> %t/header.experimental_propagate_const +#if defined(TEST_130) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_130 > /dev/null 2> %t/header.experimental_regex -#if defined(TEST_130) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_131 > /dev/null 2> %t/header.experimental_regex +#if defined(TEST_131) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_131 > /dev/null 2> %t/header.experimental_set -#if defined(TEST_131) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_132 > /dev/null 2> %t/header.experimental_set +#if defined(TEST_132) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_132 > /dev/null 2> %t/header.experimental_simd -#if defined(TEST_132) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_133 > /dev/null 2> %t/header.experimental_simd +#if defined(TEST_133) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_133 > /dev/null 2> %t/header.experimental_string -#if defined(TEST_133) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_134 > /dev/null 2> %t/header.experimental_string +#if defined(TEST_134) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_134 > /dev/null 2> %t/header.experimental_type_traits -#if defined(TEST_134) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_135 > /dev/null 2> %t/header.experimental_type_traits +#if defined(TEST_135) #include #endif -// RUN: %{cxx} %s 
%{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_135 > /dev/null 2> %t/header.experimental_unordered_map -#if defined(TEST_135) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_136 > /dev/null 2> %t/header.experimental_unordered_map +#if defined(TEST_136) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_136 > /dev/null 2> %t/header.experimental_unordered_set -#if defined(TEST_136) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_137 > /dev/null 2> %t/header.experimental_unordered_set +#if defined(TEST_137) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_137 > /dev/null 2> %t/header.experimental_utility -#if defined(TEST_137) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_138 > /dev/null 2> %t/header.experimental_utility +#if defined(TEST_138) #include #endif -// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_138 > /dev/null 2> %t/header.experimental_vector -#if defined(TEST_138) +// RUN: %{cxx} %s %{flags} %{compile_flags} --trace-includes -fshow-skipped-includes --preprocess -DTEST_139 > /dev/null 2> %t/header.experimental_vector +#if defined(TEST_139) #include #endif // RUN: %{python} %S/transitive_includes_to_csv.py %t > %t/transitive_includes.csv diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -770,6 +770,11 @@ stdexcept cstdlib stdexcept exception stdexcept iosfwd +stop_token atomic +stop_token cstddef +stop_token limits +stop_token thread +stop_token version streambuf cstdint 
streambuf ios streambuf iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -771,6 +771,11 @@ stdexcept cstdlib stdexcept exception stdexcept iosfwd +stop_token atomic +stop_token cstddef +stop_token limits +stop_token thread +stop_token version streambuf cstdint streambuf ios streambuf iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -773,6 +773,11 @@ stdexcept cstdlib stdexcept exception stdexcept iosfwd +stop_token atomic +stop_token cstddef +stop_token limits +stop_token thread +stop_token version streambuf cstdint streambuf ios streambuf iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -773,6 +773,11 @@ stdexcept cstdlib stdexcept exception stdexcept iosfwd +stop_token atomic +stop_token cstddef +stop_token limits +stop_token thread +stop_token version streambuf cstdint streambuf ios streambuf iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -779,6 +779,11 @@ stdexcept cstdlib stdexcept exception stdexcept iosfwd +stop_token atomic +stop_token cstddef +stop_token limits +stop_token thread +stop_token version streambuf cstdint streambuf ios streambuf iosfwd diff --git a/libcxx/test/libcxx/transitive_includes/cxx2b.csv b/libcxx/test/libcxx/transitive_includes/cxx2b.csv --- a/libcxx/test/libcxx/transitive_includes/cxx2b.csv +++ 
b/libcxx/test/libcxx/transitive_includes/cxx2b.csv @@ -520,6 +520,11 @@ stack initializer_list stack version stdexcept iosfwd +stop_token atomic +stop_token cstddef +stop_token limits +stop_token thread +stop_token version streambuf cstdint streambuf ios streambuf iosfwd diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/stop_token.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/stop_token.version.compile.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/stop_token.version.compile.pass.cpp @@ -0,0 +1,76 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// WARNING: This test was generated by generate_feature_test_macro_components.py +// and should not be edited manually. 
+// +// clang-format off + +// UNSUPPORTED: no-threads, availability-synchronization_library-missing + +// + +// Test the feature test macros defined by + +/* Constant Value + __cpp_lib_jthread 201911L [C++20] +*/ + +#include +#include "test_macros.h" + +#if TEST_STD_VER < 14 + +# ifdef __cpp_lib_jthread +# error "__cpp_lib_jthread should not be defined before c++20" +# endif + +#elif TEST_STD_VER == 14 + +# ifdef __cpp_lib_jthread +# error "__cpp_lib_jthread should not be defined before c++20" +# endif + +#elif TEST_STD_VER == 17 + +# ifdef __cpp_lib_jthread +# error "__cpp_lib_jthread should not be defined before c++20" +# endif + +#elif TEST_STD_VER == 20 + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_jthread +# error "__cpp_lib_jthread should be defined in c++20" +# endif +# if __cpp_lib_jthread != 201911L +# error "__cpp_lib_jthread should have the value 201911L in c++20" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_jthread +# error "__cpp_lib_jthread should not be defined because it is unimplemented in libc++!" +# endif +# endif + +#elif TEST_STD_VER > 20 + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_jthread +# error "__cpp_lib_jthread should be defined in c++2b" +# endif +# if __cpp_lib_jthread != 201911L +# error "__cpp_lib_jthread should have the value 201911L in c++2b" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_jthread +# error "__cpp_lib_jthread should not be defined because it is unimplemented in libc++!" +# endif +# endif + +#endif // TEST_STD_VER > 20 + diff --git a/libcxx/test/std/thread/thread.stoptoken/nostopstate/cons.default.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/nostopstate/cons.default.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/nostopstate/cons.default.pass.cpp @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// struct nostopstate_t { +// explicit nostopstate_t() = default; +// }; +// +// inline constexpr nostopstate_t nostopstate{}; + +#include +#include + +#include "test_macros.h" + +static_assert(std::is_trivially_default_constructible_v); + +struct Empty {}; +static_assert(sizeof(Empty) == sizeof(std::nostopstate_t)); + +template +void conversionTest(T); + +template +concept ImplicitlyDefaultConstructible = requires { conversionTest({}); }; +static_assert(!ImplicitlyDefaultConstructible); + +int main(int, char**) { + [[maybe_unused]] auto x = std::nostopstate; + [[maybe_unused]] auto y = std::nostopstate_t{}; + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopcallback/cons.const.token.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopcallback/cons.const.token.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopcallback/cons.const.token.pass.cpp @@ -0,0 +1,236 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// template +// explicit stop_callback(const stop_token& st, C&& cb) +// noexcept(is_nothrow_constructible_v); + +#include +#include +#include +#include +#include +#include +#include + +#include "make_test_thread.h" +#include "test_macros.h" + +struct Cb { + void operator()() const; +}; + +// Constraints: Callback and C satisfy constructible_from. +static_assert(std::is_constructible_v, const std::stop_token&, void (*)()>); +static_assert(!std::is_constructible_v, const std::stop_token&, void (*)(int)>); +static_assert(std::is_constructible_v, const std::stop_token&, Cb&>); +static_assert(std::is_constructible_v, const std::stop_token&, Cb&>); +static_assert(!std::is_constructible_v, const std::stop_token&, int>); + +// explicit +template +void conversion_test(T); + +template +concept ImplicitlyConstructible = requires(Args&&... 
args) { conversion_test({std::forward(args)...}); }; +static_assert(ImplicitlyConstructible); +static_assert(!ImplicitlyConstructible, const std::stop_token&, Cb>); + +// noexcept +template +struct CbNoExcept { + CbNoExcept(int) noexcept(NoExceptCtor); + void operator()() const; +}; +static_assert(std::is_nothrow_constructible_v>, const std::stop_token&, int>); +static_assert(!std::is_nothrow_constructible_v>, const std::stop_token&, int>); + +int main(int, char**) { + // was requested + { + std::stop_source ss; + const auto st = ss.get_token(); + ss.request_stop(); + + bool called = false; + std::stop_callback sc(st, [&] { called = true; }); + assert(called); + } + + // was not requested + { + std::stop_source ss; + const auto st = ss.get_token(); + + bool called = false; + std::stop_callback sc(st, [&] { called = true; }); + assert(!called); + + ss.request_stop(); + assert(called); + } + + // token has no state + { + std::stop_token st; + bool called = false; + std::stop_callback sc(st, [&] { called = true; }); + assert(!called); + } + + // should not be called multiple times + { + std::stop_source ss; + const auto st = ss.get_token(); + + int calledTimes = 0; + std::stop_callback sc(st, [&] { ++calledTimes; }); + + std::vector threads; + for (auto i = 0; i < 10; ++i) { + threads.emplace_back(support::make_test_thread([&] { ss.request_stop(); })); + } + + for (auto& thread : threads) { + thread.join(); + } + assert(calledTimes == 1); + } + + // adding more callbacks during invoking other callbacks + { + std::stop_source ss; + const auto st = ss.get_token(); + + std::atomic startedFlag = false; + std::atomic finishFlag = false; + std::stop_callback sc(st, [&] { + startedFlag = true; + startedFlag.notify_all(); + finishFlag.wait(false); + }); + + auto thread = support::make_test_thread([&] { ss.request_stop(); }); + + startedFlag.wait(false); + + // first callback is still running, adding another one; + bool secondCallbackCalled = false; + std::stop_callback 
sc2(st, [&] { secondCallbackCalled = true; }); + + finishFlag = true; + finishFlag.notify_all(); + + thread.join(); + assert(secondCallbackCalled); + } + + // adding callbacks on different threads + { + std::stop_source ss; + const auto st = ss.get_token(); + + std::vector threads; + std::atomic callbackCalledTimes = 0; + std::atomic done = false; + for (auto i = 0; i < 10; ++i) { + threads.emplace_back(support::make_test_thread([&] { + std::stop_callback sc{st, [&] { callbackCalledTimes.fetch_add(1, std::memory_order_relaxed); }}; + done.wait(false); + })); + } + using namespace std::chrono_literals; + std::this_thread::sleep_for(1ms); + ss.request_stop(); + done = true; + done.notify_all(); + for (auto& thread : threads) { + thread.join(); + } + assert(callbackCalledTimes.load(std::memory_order_relaxed) == 10); + } + + // correct overload + { + struct CBWithTracking { + bool& lvalueCalled; + bool& lvalueConstCalled; + bool& rvalueCalled; + bool& rvalueConstCalled; + + void operator()() & { lvalueCalled = true; } + void operator()() const& { lvalueConstCalled = true; } + void operator()() && { rvalueCalled = true; } + void operator()() const&& { rvalueConstCalled = true; } + }; + + // RValue + { + bool lvalueCalled = false; + bool lvalueConstCalled = false; + bool rvalueCalled = false; + bool rvalueConstCalled = false; + std::stop_source ss; + const auto st = ss.get_token(); + ss.request_stop(); + + std::stop_callback sc( + st, CBWithTracking{lvalueCalled, lvalueConstCalled, rvalueCalled, rvalueConstCalled}); + assert(rvalueCalled); + } + + // RValue + { + bool lvalueCalled = false; + bool lvalueConstCalled = false; + bool rvalueCalled = false; + bool rvalueConstCalled = false; + std::stop_source ss; + const auto st = ss.get_token(); + ss.request_stop(); + + std::stop_callback sc( + st, CBWithTracking{lvalueCalled, lvalueConstCalled, rvalueCalled, rvalueConstCalled}); + assert(rvalueConstCalled); + } + + // LValue + { + bool lvalueCalled = false; + bool 
lvalueConstCalled = false; + bool rvalueCalled = false; + bool rvalueConstCalled = false; + std::stop_source ss; + const auto st = ss.get_token(); + ss.request_stop(); + CBWithTracking cb{lvalueCalled, lvalueConstCalled, rvalueCalled, rvalueConstCalled}; + std::stop_callback sc(st, cb); + assert(lvalueCalled); + } + + // const LValue + { + bool lvalueCalled = false; + bool lvalueConstCalled = false; + bool rvalueCalled = false; + bool rvalueConstCalled = false; + std::stop_source ss; + const auto st = ss.get_token(); + ss.request_stop(); + CBWithTracking cb{lvalueCalled, lvalueConstCalled, rvalueCalled, rvalueConstCalled}; + std::stop_callback sc(st, cb); + assert(lvalueConstCalled); + } + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopcallback/cons.rvalue.token.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopcallback/cons.rvalue.token.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopcallback/cons.rvalue.token.pass.cpp @@ -0,0 +1,227 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// template +// explicit stop_callback(stop_token&& st, C&& cb) +// noexcept(is_nothrow_constructible_v); + +#include +#include +#include +#include +#include +#include +#include + +#include "make_test_thread.h" +#include "test_macros.h" + +struct Cb { + void operator()() const; +}; + +// Constraints: Callback and C satisfy constructible_from. 
+static_assert(std::is_constructible_v, std::stop_token&&, void (*)()>); +static_assert(!std::is_constructible_v, std::stop_token&&, void (*)(int)>); +static_assert(std::is_constructible_v, std::stop_token&&, Cb&>); +static_assert(std::is_constructible_v, std::stop_token&&, Cb&>); +static_assert(!std::is_constructible_v, std::stop_token&&, int>); + +// explicit +template +void conversion_test(T); + +template +concept ImplicitlyConstructible = requires(Args&&... args) { conversion_test({std::forward(args)...}); }; +static_assert(ImplicitlyConstructible); +static_assert(!ImplicitlyConstructible, std::stop_token&&, Cb>); + +// noexcept +template +struct CbNoExcept { + CbNoExcept(int) noexcept(NoExceptCtor); + void operator()() const; +}; +static_assert(std::is_nothrow_constructible_v>, std::stop_token&&, int>); +static_assert(!std::is_nothrow_constructible_v>, std::stop_token&&, int>); + +int main(int, char**) { + // was requested + { + std::stop_source ss; + ss.request_stop(); + + bool called = false; + std::stop_callback sc(ss.get_token(), [&] { called = true; }); + assert(called); + } + + // was not requested + { + std::stop_source ss; + + bool called = false; + std::stop_callback sc(ss.get_token(), [&] { called = true; }); + assert(!called); + + ss.request_stop(); + assert(called); + } + + // token has no state + { + std::stop_token st; + bool called = false; + std::stop_callback sc(std::move(st), [&] { called = true; }); + assert(!called); + } + + // should not be called multiple times + { + std::stop_source ss; + + int calledTimes = 0; + std::stop_callback sc(ss.get_token(), [&] { ++calledTimes; }); + + std::vector threads; + for (auto i = 0; i < 10; ++i) { + threads.emplace_back(support::make_test_thread([&] { ss.request_stop(); })); + } + + for (auto& thread : threads) { + thread.join(); + } + assert(calledTimes == 1); + } + + // adding more callbacks during invoking other callbacks + { + std::stop_source ss; + + std::atomic startedFlag = false; + std::atomic 
finishFlag = false; + std::stop_callback sc(ss.get_token(), [&] { + startedFlag = true; + startedFlag.notify_all(); + finishFlag.wait(false); + }); + + auto thread = support::make_test_thread([&] { ss.request_stop(); }); + + startedFlag.wait(false); + + // first callback is still running, adding another one; + bool secondCallbackCalled = false; + std::stop_callback sc2(ss.get_token(), [&] { secondCallbackCalled = true; }); + + finishFlag = true; + finishFlag.notify_all(); + + thread.join(); + assert(secondCallbackCalled); + } + + // adding callbacks on different threads + { + std::stop_source ss; + + std::vector threads; + std::atomic callbackCalledTimes = 0; + std::atomic done = false; + for (auto i = 0; i < 10; ++i) { + threads.emplace_back(support::make_test_thread([&] { + std::stop_callback sc{ss.get_token(), [&] { callbackCalledTimes.fetch_add(1, std::memory_order_relaxed); }}; + done.wait(false); + })); + } + using namespace std::chrono_literals; + std::this_thread::sleep_for(1ms); + ss.request_stop(); + done = true; + done.notify_all(); + for (auto& thread : threads) { + thread.join(); + } + assert(callbackCalledTimes.load(std::memory_order_relaxed) == 10); + } + + // correct overload + { + struct CBWithTracking { + bool& lvalueCalled; + bool& lvalueConstCalled; + bool& rvalueCalled; + bool& rvalueConstCalled; + + void operator()() & { lvalueCalled = true; } + void operator()() const& { lvalueConstCalled = true; } + void operator()() && { rvalueCalled = true; } + void operator()() const&& { rvalueConstCalled = true; } + }; + + // RValue + { + bool lvalueCalled = false; + bool lvalueConstCalled = false; + bool rvalueCalled = false; + bool rvalueConstCalled = false; + std::stop_source ss; + ss.request_stop(); + + std::stop_callback sc( + ss.get_token(), CBWithTracking{lvalueCalled, lvalueConstCalled, rvalueCalled, rvalueConstCalled}); + assert(rvalueCalled); + } + + // RValue + { + bool lvalueCalled = false; + bool lvalueConstCalled = false; + bool 
rvalueCalled = false; + bool rvalueConstCalled = false; + std::stop_source ss; + ss.request_stop(); + + std::stop_callback sc( + ss.get_token(), CBWithTracking{lvalueCalled, lvalueConstCalled, rvalueCalled, rvalueConstCalled}); + assert(rvalueConstCalled); + } + + // LValue + { + bool lvalueCalled = false; + bool lvalueConstCalled = false; + bool rvalueCalled = false; + bool rvalueConstCalled = false; + std::stop_source ss; + ss.request_stop(); + CBWithTracking cb{lvalueCalled, lvalueConstCalled, rvalueCalled, rvalueConstCalled}; + std::stop_callback sc(ss.get_token(), cb); + assert(lvalueCalled); + } + + // const LValue + { + bool lvalueCalled = false; + bool lvalueConstCalled = false; + bool rvalueCalled = false; + bool rvalueConstCalled = false; + std::stop_source ss; + ss.request_stop(); + CBWithTracking cb{lvalueCalled, lvalueConstCalled, rvalueCalled, rvalueConstCalled}; + std::stop_callback sc(ss.get_token(), cb); + assert(lvalueConstCalled); + } + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopcallback/copy.move.compile.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopcallback/copy.move.compile.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopcallback/copy.move.compile.pass.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// stop_callback(const stop_callback&) = delete; +// stop_callback(stop_callback&&) = delete; +// stop_callback& operator=(const stop_callback&) = delete; +// stop_callback& operator=(stop_callback&&) = delete; + +#include +#include + +struct Callback { + void operator()() const; +}; + +static_assert(!std::is_copy_constructible_v>); +static_assert(!std::is_move_constructible_v>); +static_assert(!std::is_copy_assignable_v>); +static_assert(!std::is_move_assignable_v>); diff --git a/libcxx/test/std/thread/thread.stoptoken/stopcallback/ctad.compile.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopcallback/ctad.compile.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopcallback/ctad.compile.pass.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// template +// stop_callback(stop_token, Callback) -> stop_callback; + +#include +#include +#include + +void test() { + std::stop_token st; + auto a = [] {}; + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert( + std::is_same_v>); +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopcallback/dtor.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopcallback/dtor.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopcallback/dtor.pass.cpp @@ -0,0 +1,166 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// ~stop_callback(); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "make_test_thread.h" +#include "test_macros.h" + +struct CallbackHolder; + +struct DeleteHolder { + CallbackHolder& holder_; + void operator()() const; +}; + +struct CallbackHolder { + std::unique_ptr> callback_; +}; + +void DeleteHolder::operator()() const { holder_.callback_.reset(); } + +int main(int, char**) { + // Unregisters the callback from the owned stop state, if any + { + std::stop_source ss; + bool called = false; + + { + std::stop_callback sc(ss.get_token(), [&] { called = true; }); + } + ss.request_stop(); + assert(!called); + } + + // The destructor does not block waiting for the execution of another + // callback registered by an associated stop_callback. + { + std::stop_source ss; + + std::atomic startedIndex = 0; + std::atomic callbackFinish = false; + + std::optional>> sc1(std::in_place, ss.get_token(), [&] { + startedIndex = 1; + startedIndex.notify_all(); + callbackFinish.wait(false); + }); + + std::optional>> sc2(std::in_place, ss.get_token(), [&] { + startedIndex = 2; + startedIndex.notify_all(); + callbackFinish.wait(false); + }); + + auto thread = support::make_test_thread([&] { ss.request_stop(); }); + + startedIndex.wait(0); + + // now one of the callback has started but not finished. 
+ if (startedIndex == 1) { + sc2.reset(); // destructor should not block + } else if (startedIndex == 2) { + sc1.reset(); // destructor should not block + } else { + assert(false); // something is wrong + } + + callbackFinish = true; + callbackFinish.notify_all(); + thread.join(); + } + + // If callback is concurrently executing on another thread, then the + // return from the invocation of callback strongly happens before ([intro.races]) + // callback is destroyed. + { + struct Callback { + std::atomic& started_; + std::atomic& waitDone_; + std::atomic& finished_; + bool moved = false; + + Callback(std::atomic& started, std::atomic& waitDone, std::atomic& finished) + : started_(started), waitDone_(waitDone), finished_(finished) {} + Callback(Callback&& other) : started_(other.started_), waitDone_(other.waitDone_), finished_(other.finished_) { + other.moved = true; + } + + void operator()() const { + struct ScopedGuard { + std::atomic& g_finished_; + ~ScopedGuard() { g_finished_.store(true, std::memory_order_relaxed); } + }; + + started_ = true; + started_.notify_all(); + waitDone_.wait(false); + ScopedGuard g{finished_}; + } + + ~Callback() { + if (!moved) { + // destructor has to be called after operator() returns + assert(finished_.load(std::memory_order_relaxed)); + } + } + }; + + std::stop_source ss; + + std::atomic started = false; + std::atomic waitDone = false; + std::atomic finished = false; + + std::optional> sc{ + std::in_place, ss.get_token(), Callback{started, waitDone, finished}}; + + auto thread1 = support::make_test_thread([&] { ss.request_stop(); }); + started.wait(false); + + auto thread2 = support::make_test_thread([&] { + using namespace std::chrono_literals; + std::this_thread::sleep_for(1ms); + waitDone = true; + waitDone.notify_all(); + }); + + sc.reset(); // destructor should block until operator() returns, i.e. 
waitDone to be true + + thread1.join(); + thread2.join(); + } + + // If callback is executing on the current thread, then the destructor does not block ([defns.block]) + // waiting for the return from the invocation of callback. + { + std::stop_source ss; + + CallbackHolder holder; + holder.callback_ = std::make_unique>(ss.get_token(), DeleteHolder{holder}); + + assert(holder.callback_ != nullptr); + + ss.request_stop(); // the callbacks deletes itself. if the destructor blocks, it would be deadlock + assert(holder.callback_ == nullptr); + } +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopcallback/typedef.compile.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopcallback/typedef.compile.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopcallback/typedef.compile.pass.cpp @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// using callback_type = _Callback; + +#include +#include + +struct Callback { + void operator()() const; +}; + +static_assert(std::is_same_v::callback_type, Callback>); +static_assert(std::is_same_v::callback_type, const Callback>); +static_assert(std::is_same_v::callback_type, Callback&>); +static_assert(std::is_same_v::callback_type, const Callback&>); +static_assert(std::is_same_v::callback_type, Callback&&>); +static_assert(std::is_same_v::callback_type, const Callback&&>); diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/assign.copy.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/assign.copy.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/assign.copy.pass.cpp @@ -0,0 +1,132 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// stop_source& operator=(const stop_source& rhs) noexcept; + +#include +#include +#include +#include + +#include "test_macros.h" + +static_assert(std::is_nothrow_copy_assignable_v); + +int main(int, char**) { + // have two different states + { + std::stop_source ss1; + std::stop_source ss2; + + assert(ss1 != ss2); + + ss2.request_stop(); + + assert(!ss1.stop_requested()); + assert(ss2.stop_requested()); + + std::same_as decltype(auto) ref = ss1 = ss2; + assert(&ref == &ss1); + + assert(ss1 == ss2); + assert(ss1.stop_requested()); + assert(ss2.stop_requested()); + } + + // this has no state + { + std::stop_source ss1{std::nostopstate}; + std::stop_source ss2; + + assert(ss1 != ss2); + + ss2.request_stop(); + + assert(!ss1.stop_requested()); + assert(!ss1.stop_possible()); + assert(ss2.stop_requested()); + assert(ss2.stop_possible()); + + std::same_as decltype(auto) ref = ss1 = ss2; + assert(&ref == &ss1); + + assert(ss1 == ss2); + assert(ss1.stop_requested()); + assert(ss1.stop_possible()); + assert(ss2.stop_requested()); + assert(ss2.stop_possible()); + } + + // other has no state + { + std::stop_source ss1; + std::stop_source ss2{std::nostopstate}; + + assert(ss1 != ss2); + + ss1.request_stop(); + + assert(ss1.stop_requested()); + assert(ss1.stop_possible()); + assert(!ss2.stop_requested()); + assert(!ss2.stop_possible()); + + std::same_as decltype(auto) ref = ss1 = ss2; + assert(&ref == &ss1); + + assert(ss1 == ss2); + assert(!ss1.stop_requested()); + assert(!ss1.stop_possible()); + assert(!ss2.stop_requested()); + assert(!ss2.stop_possible()); + } + + // both no state + { + std::stop_source ss1{std::nostopstate}; + std::stop_source ss2{std::nostopstate}; + + assert(ss1 == ss2); + + 
assert(!ss1.stop_requested()); + assert(!ss1.stop_possible()); + assert(!ss2.stop_requested()); + assert(!ss2.stop_possible()); + + std::same_as decltype(auto) ref = ss1 = ss2; + assert(&ref == &ss1); + + assert(ss1 == ss2); + assert(!ss1.stop_requested()); + assert(!ss1.stop_possible()); + assert(!ss2.stop_requested()); + assert(!ss2.stop_possible()); + } + + // self assignment + { + std::stop_source ss; + auto& self = ss; + + assert(!ss.stop_requested()); + + std::same_as decltype(auto) ref = ss = self; + assert(&ref == &ss); + + assert(!ss.stop_requested()); + + ss.request_stop(); + assert(ss.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.copy.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.copy.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.copy.pass.cpp @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// stop_source(const stop_source&) noexcept; + +#include +#include +#include +#include + +#include "test_macros.h" + +static_assert(std::is_nothrow_copy_constructible_v); + +int main(int, char**) { + { + std::stop_source source; + std::stop_source copy{source}; + + assert(source == copy); + + assert(source.stop_possible()); + assert(!source.stop_requested()); + + assert(copy.stop_possible()); + assert(!copy.stop_requested()); + + source.request_stop(); + assert(source.stop_possible()); + assert(source.stop_requested()); + + assert(copy.stop_possible()); + assert(copy.stop_requested()); + } + + // source counter incremented + { + std::optional source(std::in_place); + auto st = source->get_token(); + assert(st.stop_possible()); + + std::optional copy{source}; + source.reset(); + + assert(st.stop_possible()); + + copy.reset(); + assert(!st.stop_possible()); + } + + // copy from empty + { + std::stop_source ss1{std::nostopstate}; + std::stop_source copy{ss1}; + assert(!copy.stop_possible()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.default.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.default.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.default.pass.cpp @@ -0,0 +1,31 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// stop_source(); + +#include +#include +#include + +#include "test_macros.h" + +static_assert(std::is_default_constructible_v); + +int main(int, char**) { + { + std::stop_source ss = {}; // implicit + assert(ss.stop_possible()); + assert(!ss.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.move.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.move.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.move.pass.cpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// stop_source(stop_source&&) noexcept; + +#include +#include +#include +#include + +#include "test_macros.h" + +static_assert(std::is_nothrow_move_constructible_v); + +int main(int, char**) { + { + std::stop_source source; + + assert(source.stop_possible()); + assert(!source.stop_requested()); + + std::stop_source source2{std::move(source)}; + + assert(!source.stop_possible()); + assert(!source.stop_requested()); + + assert(source2.stop_possible()); + assert(!source2.stop_requested()); + + source2.request_stop(); + + assert(!source.stop_possible()); + assert(!source.stop_requested()); + + assert(source2.stop_possible()); + assert(source2.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.nostopstate.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.nostopstate.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/cons.nostopstate.pass.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// explicit stop_source(nostopstate_t) noexcept; + +#include +#include +#include + +#include "test_macros.h" + +static_assert(std::is_nothrow_constructible_v); +// explicit +static_assert(!std::is_convertible_v); + +int main(int, char**) { + { + std::stop_source ss(std::nostopstate); + assert(!ss.stop_possible()); + assert(!ss.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/equals.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/equals.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/equals.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// [[nodiscard]] bool operator==(const stop_source& lhs, const stop_source& rhs) noexcept; +// Returns: true if lhs and rhs have ownership of the same stop state or if both lhs and rhs do not have ownership of a stop state; otherwise false. 
+ +#include +#include +#include +#include + +#include "test_macros.h" + +template +concept IsNoThrowEqualityComparable = requires(const T& t1, const T& t2) { + { t1 == t2 } noexcept; +}; + +static_assert(IsNoThrowEqualityComparable); + +int main(int, char**) { + // both no state + { + const std::stop_source ss1(std::nostopstate); + const std::stop_source ss2(std::nostopstate); + assert(ss1 == ss2); + assert(!(ss1 != ss2)); + } + + // only one has no state + { + const std::stop_source ss1(std::nostopstate); + const std::stop_source ss2; + assert(!(ss1 == ss2)); + assert(ss1 != ss2); + } + + // both has states. same state + { + const std::stop_source ss1; + const std::stop_source ss2(ss1); + assert(ss1 == ss2); + assert(!(ss1 != ss2)); + } + + // both has states. different states + { + const std::stop_source ss1; + const std::stop_source ss2; + assert(!(ss1 == ss2)); + assert(ss1 != ss2); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/get_token.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/get_token.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/get_token.pass.cpp @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// [[nodiscard]] stop_token get_token() const noexcept; + +#include +#include +#include +#include + +#include "test_macros.h" + +template +concept IsGetTokenNoexcept = requires(const T& t) { + { t.get_token() } noexcept; +}; + +static_assert(IsGetTokenNoexcept); + +int main(int, char**) { + // no state + { + std::stop_source ss{std::nostopstate}; + std::same_as decltype(auto) st = ss.get_token(); + assert(!st.stop_possible()); + assert(!st.stop_requested()); + } + + // with state + { + std::stop_source ss; + std::same_as decltype(auto) st = ss.get_token(); + assert(st.stop_possible()); + assert(!st.stop_requested()); + + ss.request_stop(); + assert(st.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/move.copy.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/move.copy.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/move.copy.pass.cpp @@ -0,0 +1,132 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// stop_source& operator=(stop_source&& rhs) noexcept; + +#include +#include +#include +#include +#include + +#include "test_macros.h" + +static_assert(std::is_nothrow_move_assignable_v); + +int main(int, char**) { + // have two different states + { + std::stop_source ss1; + std::stop_source ss2; + + assert(ss1 != ss2); + + ss2.request_stop(); + + assert(!ss1.stop_requested()); + assert(ss2.stop_requested()); + + std::same_as decltype(auto) ref = ss1 = std::move(ss2); + assert(&ref == &ss1); + + assert(ss1.stop_requested()); + assert(!ss2.stop_possible()); + assert(!ss2.stop_requested()); + } + + // this has no state + { + std::stop_source ss1{std::nostopstate}; + std::stop_source ss2; + + assert(ss1 != ss2); + + ss2.request_stop(); + + assert(!ss1.stop_requested()); + assert(!ss1.stop_possible()); + assert(ss2.stop_requested()); + assert(ss2.stop_possible()); + + std::same_as decltype(auto) ref = ss1 = std::move(ss2); + assert(&ref == &ss1); + + assert(ss1.stop_requested()); + assert(ss1.stop_possible()); + assert(!ss2.stop_requested()); + assert(!ss2.stop_possible()); + } + + // other has no state + { + std::stop_source ss1; + std::stop_source ss2{std::nostopstate}; + + assert(ss1 != ss2); + + ss1.request_stop(); + + assert(ss1.stop_requested()); + assert(ss1.stop_possible()); + assert(!ss2.stop_requested()); + assert(!ss2.stop_possible()); + + std::same_as decltype(auto) ref = ss1 = std::move(ss2); + assert(&ref == &ss1); + + assert(ss1 == ss2); + assert(!ss1.stop_requested()); + assert(!ss1.stop_possible()); + assert(!ss2.stop_requested()); + assert(!ss2.stop_possible()); + } + + // both no state + { + std::stop_source ss1{std::nostopstate}; + std::stop_source ss2{std::nostopstate}; + 
+ assert(ss1 == ss2); + + assert(!ss1.stop_requested()); + assert(!ss1.stop_possible()); + assert(!ss2.stop_requested()); + assert(!ss2.stop_possible()); + + std::same_as decltype(auto) ref = ss1 = std::move(ss2); + assert(&ref == &ss1); + + assert(ss1 == ss2); + assert(!ss1.stop_requested()); + assert(!ss1.stop_possible()); + assert(!ss2.stop_requested()); + assert(!ss2.stop_possible()); + } + + // self assignment + { + std::stop_source ss; + auto& self = ss; + + assert(!ss.stop_requested()); + + std::same_as decltype(auto) ref = ss = std::move(self); + assert(&ref == &ss); + + assert(!ss.stop_requested()); + + ss.request_stop(); + assert(ss.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/nodiscard.verify.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/nodiscard.verify.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/nodiscard.verify.cpp @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// [[nodiscard]] stop_token get_token() const noexcept; +// [[nodiscard]] bool stop_possible() const noexcept; +// [[nodiscard]] bool stop_requested() const noexcept; +// [[nodiscard]] friend bool operator==(const stop_source& lhs, const stop_source& rhs) noexcept; + +#include + +void test() { + std::stop_source ss; + ss.get_token(); // expected-warning {{ignoring return value of function}} + ss.stop_requested(); // expected-warning {{ignoring return value of function}} + ss.stop_possible(); // expected-warning {{ignoring return value of function}} + operator==(ss, ss); // expected-warning {{ignoring return value of function}} +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/request_stop.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/request_stop.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/request_stop.pass.cpp @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// bool request_stop() noexcept; + +#include +#include +#include +#include +#include +#include + +#include "make_test_thread.h" +#include "test_macros.h" + +template +concept IsRequestStopNoexcept = requires(T& t) { + { t.request_stop() } noexcept; +}; + +static_assert(IsRequestStopNoexcept); + +int main(int, char**) { + // If *this does not have ownership of a stop state, returns false + { + std::stop_source ss{std::nostopstate}; + auto ret = ss.request_stop(); + assert(!ret); + assert(!ss.stop_requested()); + } + + // Otherwise, atomically determines whether the owned stop state has received + // a stop request, and if not, makes a stop request + { + std::stop_source ss; + + auto ret = ss.request_stop(); + assert(ret); + assert(ss.stop_requested()); + } + + // already requested + { + std::stop_source ss; + ss.request_stop(); + assert(ss.stop_requested()); + + auto ret = ss.request_stop(); + assert(!ret); + assert(ss.stop_requested()); + } + + // If the request was made, the callbacks registered by + // associated stop_callback objects are synchronously called. 
+ { + std::stop_source ss; + auto st = ss.get_token(); + + bool cb1Called = false; + bool cb2Called = false; + std::stop_callback sc1(st, [&] { cb1Called = true; }); + std::stop_callback sc2(st, [&] { cb2Called = true; }); + + ss.request_stop(); + assert(cb1Called); + assert(cb2Called); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/stop_possible.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/stop_possible.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/stop_possible.pass.cpp @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// [[nodiscard]] bool stop_possible() const noexcept; +// Returns: true if *this has ownership of a stop state; otherwise, false. 
+ +#include +#include +#include + +#include "test_macros.h" + +template +concept IsStopPossibleNoexcept = requires(const T& t) { + { t.stop_possible() } noexcept; +}; + +static_assert(IsStopPossibleNoexcept); + +int main(int, char**) { + // no state + { + const std::stop_source st{std::nostopstate}; + assert(!st.stop_possible()); + } + + // with state + { + const std::stop_source st; + assert(st.stop_possible()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/stop_requested.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/stop_requested.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/stop_requested.pass.cpp @@ -0,0 +1,105 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// [[nodiscard]] bool stop_requested() const noexcept; +// true if *this has ownership of a stop state that has received a stop request; otherwise, false. 
+ +#include +#include +#include +#include +#include +#include + +#include "make_test_thread.h" +#include "test_macros.h" + +template +concept IsStopRequestedNoexcept = requires(const T& t) { + { t.stop_requested() } noexcept; +}; + +static_assert(IsStopRequestedNoexcept); + +int main(int, char**) { + // no state + { + const std::stop_source ss{std::nostopstate}; + assert(!ss.stop_requested()); + } + + // has state + { + std::stop_source ss; + assert(!ss.stop_requested()); + + ss.request_stop(); + assert(ss.stop_requested()); + } + + // request from another instance with same state + { + std::stop_source ss1; + auto ss2 = ss1; + ss2.request_stop(); + assert(ss1.stop_requested()); + } + + // request from another instance with different state + { + std::stop_source ss1; + std::stop_source ss2; + + ss2.request_stop(); + assert(!ss1.stop_requested()); + } + + // multiple threads + { + std::stop_source ss; + + std::thread t = support::make_test_thread([&]() { ss.request_stop(); }); + + t.join(); + assert(ss.stop_requested()); + } + + // [thread.stopsource.intro] A call to request_stop that returns true + // synchronizes with a call to stop_requested on an associated stop_source + // or stop_source object that returns true. 
+ { + std::stop_source ss; + + bool flag = false; + + std::thread t = support::make_test_thread([&]() { + using namespace std::chrono_literals; + std::this_thread::sleep_for(1ms); + + // happens-before request_stop + flag = true; + auto b = ss.request_stop(); + assert(b); + }); + + while (!ss.stop_requested()) { + std::this_thread::yield(); + } + + // write should be visible to the current thread + assert(flag == true); + + t.join(); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/swap.free.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/swap.free.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/swap.free.pass.cpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// void swap(stop_source& rhs) noexcept; + +#include +#include +#include +#include + +#include "test_macros.h" + +template +concept IsNoThrowFreeSwappable = requires(T& t) { + { swap(t, t) } noexcept; +}; + +static_assert(IsNoThrowFreeSwappable); + +int main(int, char**) { + { + std::stop_source ss1; + std::stop_source ss2; + + assert(ss1 != ss2); + + ss2.request_stop(); + + assert(!ss1.stop_requested()); + assert(ss2.stop_requested()); + + swap(ss1, ss2); + + assert(ss1 != ss2); + assert(ss1.stop_requested()); + assert(!ss2.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stopsource/swap.member.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stopsource/swap.member.pass.cpp new file mode 
100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stopsource/swap.member.pass.cpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// void swap(stop_source& rhs) noexcept; + +#include +#include +#include +#include + +#include "test_macros.h" + +template +concept IsNoThrowMemberSwappable = requires(T& t) { + { t.swap(t) } noexcept; +}; + +static_assert(IsNoThrowMemberSwappable); + +int main(int, char**) { + { + std::stop_source ss1; + std::stop_source ss2; + + assert(ss1 != ss2); + + ss2.request_stop(); + + assert(!ss1.stop_requested()); + assert(ss2.stop_requested()); + + ss1.swap(ss2); + + assert(ss1 != ss2); + assert(ss1.stop_requested()); + assert(!ss2.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stoptoken/assign.copy.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stoptoken/assign.copy.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stoptoken/assign.copy.pass.cpp @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +#include +#include +#include +#include + +#include "test_macros.h" + +static_assert(std::is_nothrow_copy_assignable_v); + +int main(int, char**) { + { + std::stop_token st1; + + std::stop_source source; + auto st2 = source.get_token(); + + assert(st1 != st2); + + source.request_stop(); + + assert(!st1.stop_requested()); + assert(st2.stop_requested()); + + std::same_as decltype(auto) ref = st1 = st2; + assert(&ref == &st1); + + assert(st1 == st2); + assert(st1.stop_requested()); + assert(st2.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stoptoken/assign.move.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stoptoken/assign.move.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stoptoken/assign.move.pass.cpp @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// stop_token& operator=(stop_token&& rhs) noexcept; + +#include +#include +#include +#include +#include + +#include "test_macros.h" + +static_assert(std::is_nothrow_move_assignable_v); + +int main(int, char**) { + { + std::stop_token st1; + + std::stop_source source; + auto st2 = source.get_token(); + + assert(st1 != st2); + + source.request_stop(); + + assert(!st1.stop_requested()); + assert(st2.stop_requested()); + + std::same_as decltype(auto) ref = st1 = std::move(st2); + assert(&ref == &st1); + + assert(st1 != st2); + assert(st1.stop_requested()); + assert(!st2.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stoptoken/cons.copy.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stoptoken/cons.copy.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stoptoken/cons.copy.pass.cpp @@ -0,0 +1,47 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// stop_token(const stop_token&) noexcept; + +#include +#include +#include + +#include "test_macros.h" + +static_assert(std::is_nothrow_copy_constructible_v); + +int main(int, char**) { + { + std::stop_source source; + auto st = source.get_token(); + std::stop_token copy{st}; + + assert(st == copy); + + assert(st.stop_possible()); + assert(!st.stop_requested()); + + assert(copy.stop_possible()); + assert(!copy.stop_requested()); + + source.request_stop(); + assert(st.stop_possible()); + assert(st.stop_requested()); + + assert(copy.stop_possible()); + assert(copy.stop_requested()); + + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stoptoken/cons.default.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stoptoken/cons.default.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stoptoken/cons.default.pass.cpp @@ -0,0 +1,31 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// stop_token() noexcept; + +#include +#include +#include + +#include "test_macros.h" + +static_assert(std::is_nothrow_default_constructible_v); + +int main(int, char**) { + { + std::stop_token st = {}; // implicit + assert(!st.stop_possible()); + assert(!st.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stoptoken/cons.move.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stoptoken/cons.move.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stoptoken/cons.move.pass.cpp @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// stop_token(stop_token&&) noexcept; + +#include +#include +#include +#include + +#include "test_macros.h" + +static_assert(std::is_nothrow_move_constructible_v); + +int main(int, char**) { + { + std::stop_source source; + auto st = source.get_token(); + + assert(st.stop_possible()); + assert(!st.stop_requested()); + + std::stop_token st2{std::move(st)}; + + assert(!st.stop_possible()); + assert(!st.stop_requested()); + + assert(st2.stop_possible()); + assert(!st2.stop_requested()); + + source.request_stop(); + + assert(!st.stop_possible()); + assert(!st.stop_requested()); + + assert(st2.stop_possible()); + assert(st2.stop_requested()); + + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stoptoken/equals.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stoptoken/equals.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stoptoken/equals.pass.cpp @@ -0,0 +1,87 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// [[nodiscard]] bool operator==(const stop_token& lhs, const stop_token& rhs) noexcept; +// Returns: true if lhs and rhs have ownership of the same stop state or if both lhs and rhs do not have ownership of a stop state; otherwise false. 
+ +// synthesized operator != also tested. + +#include <cassert> +#include <concepts> +#include <stop_token> +#include <type_traits> + +#include "test_macros.h" + +// LWG 3254 is related. +template <class T> +concept IsNoThrowEqualityComparable = requires(const T& t1, const T& t2) { + { t1 == t2 } noexcept; +}; + +template <class T> +concept IsNoThrowInequalityComparable = requires(const T& t1, const T& t2) { + { t1 != t2 } noexcept; +}; + +static_assert(IsNoThrowEqualityComparable<std::stop_token>); +static_assert(IsNoThrowInequalityComparable<std::stop_token>); + +int main(int, char**) { + // both no state + { + const std::stop_token st1; + const std::stop_token st2; + assert(st1 == st2); + assert(!(st1 != st2)); + } + + // only one has no state + { + std::stop_source ss; + const std::stop_token st1; + const auto st2 = ss.get_token(); + assert(!(st1 == st2)); + assert(st1 != st2); + } + + // both have states. same source + { + std::stop_source ss; + const auto st1 = ss.get_token(); + const auto st2 = ss.get_token(); + assert(st1 == st2); + assert(!(st1 != st2)); + } + + // both have states. different sources with same states + { + std::stop_source ss1; + auto ss2 = ss1; + const auto st1 = ss1.get_token(); + const auto st2 = ss2.get_token(); + assert(st1 == st2); + assert(!(st1 != st2)); + } + + // both have states. different sources with different states + { + std::stop_source ss1; + std::stop_source ss2; + const auto st1 = ss1.get_token(); + const auto st2 = ss2.get_token(); + assert(!(st1 == st2)); + assert(st1 != st2); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stoptoken/nodiscard.verify.cpp b/libcxx/test/std/thread/thread.stoptoken/stoptoken/nodiscard.verify.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stoptoken/nodiscard.verify.cpp @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// [[nodiscard]] bool stop_requested() const noexcept; +// [[nodiscard]] bool stop_possible() const noexcept; +// [[nodiscard]] friend bool operator==(const stop_token& lhs, const stop_token& rhs) noexcept; + +#include + +void test() { + std::stop_token st; + st.stop_requested(); // expected-warning {{ignoring return value of function}} + st.stop_possible(); // expected-warning {{ignoring return value of function}} + operator==(st, st); // expected-warning {{ignoring return value of function}} +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stoptoken/stop_possible.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stoptoken/stop_possible.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stoptoken/stop_possible.pass.cpp @@ -0,0 +1,94 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// [[nodiscard]] bool stop_possible() const noexcept; +// Returns: false if: +// - *this does not have ownership of a stop state, or +// - a stop request was not made and there are no associated stop_source objects; +// otherwise, true. 
+ +#include <cassert> +#include <concepts> +#include <optional> +#include <stop_token> +#include <thread> + +#include "make_test_thread.h" +#include "test_macros.h" + +template <class T> +concept IsStopPossibleNoexcept = requires(const T& t) { + { t.stop_possible() } noexcept; +}; + +static_assert(IsStopPossibleNoexcept<std::stop_token>); + +int main(int, char**) { + // no state + { + const std::stop_token st; + assert(!st.stop_possible()); + } + + // a stop request was not made and there are no associated stop_source objects + { + std::optional<std::stop_source> ss{std::in_place}; + const auto st = ss->get_token(); + ss.reset(); + + assert(!st.stop_possible()); + } + + // a stop request was not made, but there is an associated stop_source object + { + std::stop_source ss; + const auto st = ss.get_token(); + assert(st.stop_possible()); + } + + // a stop request was made and there are no associated stop_source objects + { + std::optional<std::stop_source> ss{std::in_place}; + const auto st = ss->get_token(); + ss->request_stop(); + ss.reset(); + + assert(st.stop_possible()); + } + + // a stop request was made and there is an associated stop_source object + { + std::stop_source ss; + const auto st = ss.get_token(); + ss.request_stop(); + assert(st.stop_possible()); + } + + // a stop request was made on a different thread and + // there are no associated stop_source objects + { + std::optional<std::stop_source> ss{std::in_place}; + const auto st = ss->get_token(); + + std::thread t = support::make_test_thread([&]() { + ss->request_stop(); + ss.reset(); + }); + + assert(st.stop_possible()); + t.join(); + assert(st.stop_possible()); + + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stoptoken/stop_requested.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stoptoken/stop_requested.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stoptoken/stop_requested.pass.cpp @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// [[nodiscard]] bool stop_requested() const noexcept; +// Returns: true if *this has ownership of a stop state that has received a stop request; otherwise, false. + +#include +#include +#include +#include +#include +#include + +#include "make_test_thread.h" +#include "test_macros.h" + +template +concept IsStopRequestedNoexcept = requires(const T& t) { + { t.stop_requested() } noexcept; +}; + +static_assert(IsStopRequestedNoexcept); + +int main(int, char**) { + // no state + { + const std::stop_token st; + assert(!st.stop_requested()); + } + + // has state + { + std::stop_source ss; + const auto st = ss.get_token(); + assert(!st.stop_requested()); + + ss.request_stop(); + assert(st.stop_requested()); + } + + // already requested before constructor + { + std::stop_source ss; + ss.request_stop(); + const auto st = ss.get_token(); + assert(st.stop_requested()); + } + + // stop_token should share the state + { + std::optional ss{std::in_place}; + ss->request_stop(); + const auto st = ss->get_token(); + + ss.reset(); + assert(st.stop_requested()); + } + + // single stop_source, multiple stop_token + { + std::stop_source ss; + const auto st1 = ss.get_token(); + const auto st2 = ss.get_token(); + assert(!st1.stop_requested()); + assert(!st2.stop_requested()); + + ss.request_stop(); + assert(st1.stop_requested()); + assert(st2.stop_requested()); + } + + // multiple stop_source, multiple stop_token + { + std::stop_source ss1; + std::stop_source ss2; + + const auto st1 = ss1.get_token(); + const auto st2 = ss2.get_token(); + assert(!st1.stop_requested()); + assert(!st2.stop_requested()); + + ss1.request_stop(); + 
assert(st1.stop_requested()); + assert(!st2.stop_requested()); + } + + // multiple threads + { + std::stop_source ss; + const auto st = ss.get_token(); + assert(!st.stop_requested()); + + std::thread t = support::make_test_thread([&]() { ss.request_stop(); }); + + t.join(); + assert(st.stop_requested()); + } + + // maybe concurrent calls + { + std::stop_source ss; + const auto st = ss.get_token(); + assert(!st.stop_requested()); + + std::thread t = support::make_test_thread([&]() { ss.request_stop(); }); + + while (!st.stop_requested()) { + // should eventually exit the loop + std::this_thread::yield(); + } + + t.join(); + } + + // [thread.stoptoken.intro] A call to request_stop that returns true + // synchronizes with a call to stop_requested on an associated stop_token + // or stop_source object that returns true. + { + std::stop_source ss; + const auto st = ss.get_token(); + assert(!st.stop_requested()); + + bool flag = false; + + std::thread t = support::make_test_thread([&]() { + using namespace std::chrono_literals; + std::this_thread::sleep_for(1ms); + + // happens-before request_stop + flag = true; + auto b = ss.request_stop(); + assert(b); + }); + + while (!st.stop_requested()) { + std::this_thread::yield(); + } + + // write should be visible to the current thread + assert(flag == true); + + t.join(); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stoptoken/swap.free.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stoptoken/swap.free.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stoptoken/swap.free.pass.cpp @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// friend void swap(stop_token& x, stop_token& y) noexcept; + +#include +#include +#include +#include + +#include "test_macros.h" + +template +concept IsNoThrowFreeSwappable = requires(T& t) { + { swap(t, t) } noexcept; +}; + +static_assert(IsNoThrowFreeSwappable); + +int main(int, char**) { + { + std::stop_token st1; + + std::stop_source source; + auto st2 = source.get_token(); + + assert(st1 != st2); + + source.request_stop(); + + assert(!st1.stop_requested()); + assert(st2.stop_requested()); + + swap(st1, st2); + + assert(st1 != st2); + assert(st1.stop_requested()); + assert(!st2.stop_requested()); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.stoptoken/stoptoken/swap.member.pass.cpp b/libcxx/test/std/thread/thread.stoptoken/stoptoken/swap.member.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/thread/thread.stoptoken/stoptoken/swap.member.pass.cpp @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing + +// void swap(stop_token& rhs) noexcept; + +#include +#include +#include +#include + +#include "test_macros.h" + +template +concept IsNoThrowMemberSwappable = requires(T& t) { + { t.swap(t) } noexcept; +}; + +static_assert(IsNoThrowMemberSwappable); + +int main(int, char**) { + { + std::stop_token st1; + + std::stop_source source; + auto st2 = source.get_token(); + + assert(st1 != st2); + + source.request_stop(); + + assert(!st1.stop_requested()); + assert(st2.stop_requested()); + + st1.swap(st2); + + assert(st1 != st2); + assert(st1.stop_requested()); + assert(!st2.stop_requested()); + } + + return 0; +} diff --git a/libcxx/utils/data/ignore_format.txt b/libcxx/utils/data/ignore_format.txt --- a/libcxx/utils/data/ignore_format.txt +++ b/libcxx/utils/data/ignore_format.txt @@ -250,6 +250,7 @@ libcxx/include/__chrono/monthday.h libcxx/include/__chrono/month.h libcxx/include/__chrono/month_weekday.h +libcxx/include/__chrono/statically_widen.h libcxx/include/__chrono/steady_clock.h libcxx/include/__chrono/system_clock.h libcxx/include/__chrono/time_point.h @@ -307,8 +308,10 @@ libcxx/include/__debug_utils/randomize_range.h libcxx/include/deque libcxx/include/errno.h +libcxx/include/__exception/exception_ptr.h libcxx/include/expected libcxx/include/__expected/expected.h +libcxx/include/__expected/unexpected.h libcxx/include/experimental/__config libcxx/include/experimental/iterator libcxx/include/experimental/map @@ -488,7 +491,6 @@ libcxx/include/new libcxx/include/__node_handle libcxx/include/numbers -libcxx/include/numeric libcxx/include/__numeric/accumulate.h libcxx/include/__numeric/adjacent_difference.h libcxx/include/__numeric/exclusive_scan.h @@ -504,6 +506,7 @@ 
libcxx/include/__numeric/transform_reduce.h libcxx/include/optional libcxx/include/ostream +libcxx/include/__pstl/internal/algorithm_impl.h libcxx/include/__pstl/internal/numeric_impl.h libcxx/include/__pstl/internal/omp/parallel_for_each.h libcxx/include/__pstl/internal/omp/parallel_for.h @@ -625,6 +628,9 @@ libcxx/include/__support/win32/locale_win32.h libcxx/include/__support/xlocale/__nop_locale_mgmt.h libcxx/include/__system_error/errc.h +libcxx/include/__system_error/error_category.h +libcxx/include/__system_error/error_code.h +libcxx/include/__system_error/error_condition.h libcxx/include/thread libcxx/include/__threading_support libcxx/include/__thread/poll_with_backoff.h @@ -771,6 +777,7 @@ libcxx/include/__utility/piecewise_construct.h libcxx/include/__utility/priority_tag.h libcxx/include/__utility/rel_ops.h +libcxx/include/__utility/terminate_on_exception.h libcxx/include/__utility/to_underlying.h libcxx/include/__utility/unreachable.h libcxx/include/valarray diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -815,6 +815,7 @@ "semaphore": ["UNSUPPORTED: no-threads"], "shared_mutex": ["UNSUPPORTED: no-threads"], "stdatomic.h": ["UNSUPPORTED: no-threads"], + "stop_token": ["UNSUPPORTED: no-threads, availability-synchronization_library-missing"], "thread": ["UNSUPPORTED: no-threads"], } diff --git a/libcxx/utils/generate_header_inclusion_tests.py b/libcxx/utils/generate_header_inclusion_tests.py --- a/libcxx/utils/generate_header_inclusion_tests.py +++ b/libcxx/utils/generate_header_inclusion_tests.py @@ -73,6 +73,7 @@ "initializer_list": "11", "optional": "17", "ranges": "20", + "stop_token": "20", "string_view": "17", "syncstream": "20", "system_error": "11", diff --git a/libcxx/utils/generate_header_tests.py b/libcxx/utils/generate_header_tests.py --- 
a/libcxx/utils/generate_header_tests.py +++ b/libcxx/utils/generate_header_tests.py @@ -15,6 +15,7 @@ "semaphore": "!defined(_LIBCPP_HAS_NO_THREADS)", "shared_mutex": "!defined(_LIBCPP_HAS_NO_THREADS)", "stdatomic.h": "__cplusplus > 202002L && !defined(_LIBCPP_HAS_NO_THREADS)", + "stop_token": "!defined(_LIBCPP_HAS_NO_THREADS) && defined(_LIBCPP_AVAILABILITY_SYNC)", "thread": "!defined(_LIBCPP_HAS_NO_THREADS)", "filesystem": "!defined(_LIBCPP_HAS_NO_FILESYSTEM_LIBRARY)", diff --git a/lldb/include/lldb/Symbol/SymbolContext.h b/lldb/include/lldb/Symbol/SymbolContext.h --- a/lldb/include/lldb/Symbol/SymbolContext.h +++ b/lldb/include/lldb/Symbol/SymbolContext.h @@ -250,8 +250,8 @@ /// For C++ the name is "this", for Objective-C the name is "self". /// /// \return - /// Returns a string for the name of the instance variable. - ConstString GetInstanceVariableName(); + /// Returns a StringRef for the name of the instance variable. + llvm::StringRef GetInstanceVariableName(); /// Sorts the types in TypeMap according to SymbolContext to TypeList /// diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h --- a/lldb/include/lldb/Target/Language.h +++ b/lldb/include/lldb/Target/Language.h @@ -326,7 +326,7 @@ return ConstString(); } - virtual ConstString GetInstanceVariableName() { return {}; } + virtual llvm::StringRef GetInstanceVariableName() { return {}; } protected: // Classes that inherit from Language can see and modify these diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h @@ -165,7 +165,7 @@ ConstString FindBestAlternateFunctionMangledName( const Mangled mangled, const SymbolContext &sym_ctx) const override; - ConstString GetInstanceVariableName() override { return ConstString("this"); } + llvm::StringRef 
GetInstanceVariableName() override { return "this"; } // PluginInterface protocol llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h @@ -191,7 +191,7 @@ return false; } - ConstString GetInstanceVariableName() override { return ConstString("self"); } + llvm::StringRef GetInstanceVariableName() override { return "self"; } // PluginInterface protocol llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } diff --git a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.h b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.h --- a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.h +++ b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.h @@ -40,7 +40,7 @@ static lldb_private::Language *CreateInstance(lldb::LanguageType language); - ConstString GetInstanceVariableName() override { return ConstString("self"); } + llvm::StringRef GetInstanceVariableName() override { return "self"; } static llvm::StringRef GetPluginNameStatic() { return "objcplusplus"; } diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -5115,6 +5115,7 @@ case clang::BuiltinType::RvvBool16: case clang::BuiltinType::RvvBool32: case clang::BuiltinType::RvvBool64: + case clang::BuiltinType::RvvInt32m1x2: break; // WebAssembly builtin types. 
diff --git a/lldb/source/Symbol/SymbolContext.cpp b/lldb/source/Symbol/SymbolContext.cpp --- a/lldb/source/Symbol/SymbolContext.cpp +++ b/lldb/source/Symbol/SymbolContext.cpp @@ -541,7 +541,7 @@ return nullptr; } -ConstString SymbolContext::GetInstanceVariableName() { +llvm::StringRef SymbolContext::GetInstanceVariableName() { LanguageType lang_type = eLanguageTypeUnknown; if (Block *function_block = GetFunctionBlock()) diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -567,8 +567,9 @@ // Check for direct ivars access which helps us with implicit access to // ivars using "this" or "self". GetSymbolContext(eSymbolContextFunction | eSymbolContextBlock); - if (auto instance_var_name = m_sc.GetInstanceVariableName()) { - var_sp = variable_list->FindVariable(instance_var_name); + llvm::StringRef instance_var_name = m_sc.GetInstanceVariableName(); + if (!instance_var_name.empty()) { + var_sp = variable_list->FindVariable(ConstString(instance_var_name)); if (var_sp) { separator_idx = 0; if (Type *var_type = var_sp->GetType()) diff --git a/lldb/test/API/functionalities/bt-interrupt/TestInterruptBacktrace.py b/lldb/test/API/functionalities/bt-interrupt/TestInterruptBacktrace.py --- a/lldb/test/API/functionalities/bt-interrupt/TestInterruptBacktrace.py +++ b/lldb/test/API/functionalities/bt-interrupt/TestInterruptBacktrace.py @@ -6,12 +6,13 @@ import lldb import lldbsuite.test.lldbutil as lldbutil from lldbsuite.test.lldbtest import * - +from lldbsuite.test.decorators import * class TestInterruptingBacktrace(TestBase): NO_DEBUG_INFO_TESTCASE = True + @skipIf(oslist=["linux"], archs=["arm"]) def test_backtrace_interrupt(self): """Use RequestInterrupt followed by stack operations to ensure correct interrupt behavior for stacks.""" diff --git a/llvm/CODE_OWNERS.TXT b/llvm/CODE_OWNERS.TXT --- a/llvm/CODE_OWNERS.TXT +++ b/llvm/CODE_OWNERS.TXT @@ -160,8 +160,8 @@ E: 
david.majnemer@gmail.com D: IR Constant Folder, InstCombine -N: Dylan McKay -E: me@dylanmckay.io +N: Ben Shi +E: 2283975856@qq.com, powerman1st@163.com D: AVR Backend N: Kazushi Marukawa diff --git a/llvm/include/llvm/ADT/PostOrderIterator.h b/llvm/include/llvm/ADT/PostOrderIterator.h --- a/llvm/include/llvm/ADT/PostOrderIterator.h +++ b/llvm/include/llvm/ADT/PostOrderIterator.h @@ -106,13 +106,14 @@ using NodeRef = typename GT::NodeRef; using ChildItTy = typename GT::ChildIteratorType; - // VisitStack - Used to maintain the ordering. Top = current block - // First element is basic block pointer, second is the 'next child' to visit - SmallVector, 8> VisitStack; + /// Used to maintain the ordering. + /// First element is basic block pointer, second is iterator for the next + /// child to visit, third is the end iterator. + SmallVector, 8> VisitStack; po_iterator(NodeRef BB) { this->insertEdge(std::optional(), BB); - VisitStack.push_back(std::make_pair(BB, GT::child_begin(BB))); + VisitStack.emplace_back(BB, GT::child_begin(BB), GT::child_end(BB)); traverseChild(); } @@ -121,7 +122,7 @@ po_iterator(NodeRef BB, SetType &S) : po_iterator_storage(S) { if (this->insertEdge(std::optional(), BB)) { - VisitStack.push_back(std::make_pair(BB, GT::child_begin(BB))); + VisitStack.emplace_back(BB, GT::child_begin(BB), GT::child_end(BB)); traverseChild(); } } @@ -131,12 +132,14 @@ } // End is when stack is empty. void traverseChild() { - while (VisitStack.back().second != GT::child_end(VisitStack.back().first)) { - NodeRef BB = *VisitStack.back().second++; - if (this->insertEdge(std::optional(VisitStack.back().first), - BB)) { + while (true) { + auto &[ParentBB, It, End] = VisitStack.back(); + if (It == End) + break; + NodeRef BB = *It++; + if (this->insertEdge(std::optional(ParentBB), BB)) { // If the block is not visited... 
- VisitStack.push_back(std::make_pair(BB, GT::child_begin(BB))); + VisitStack.emplace_back(BB, GT::child_begin(BB), GT::child_end(BB)); } } } @@ -158,7 +161,7 @@ } bool operator!=(const po_iterator &x) const { return !(*this == x); } - const NodeRef &operator*() const { return VisitStack.back().first; } + const NodeRef &operator*() const { return std::get<0>(VisitStack.back()); } // This is a nonstandard operator-> that dereferences the pointer an extra // time... so that you can actually call methods ON the BasicBlock, because @@ -167,7 +170,7 @@ NodeRef operator->() const { return **this; } po_iterator &operator++() { // Preincrement - this->finishPostorder(VisitStack.back().first); + this->finishPostorder(std::get<0>(VisitStack.back())); VisitStack.pop_back(); if (!VisitStack.empty()) traverseChild(); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -320,9 +320,11 @@ /// Estimate the cost of a chain of pointers (typically pointer operands of a /// chain of loads or stores within same block) operations set when lowered. + /// \p AccessTy is the type of the loads/stores that will ultimately use the + /// \p Ptrs. 
InstructionCost getPointersChainCost(ArrayRef Ptrs, const Value *Base, - const PointersChainInfo &Info, + const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind = TTI::TCK_RecipThroughput ) const; @@ -1663,7 +1665,7 @@ TTI::TargetCostKind CostKind) = 0; virtual InstructionCost getPointersChainCost(ArrayRef Ptrs, const Value *Base, - const TTI::PointersChainInfo &Info, + const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) = 0; virtual unsigned getInliningThresholdMultiplier() = 0; virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0; @@ -2024,8 +2026,9 @@ InstructionCost getPointersChainCost(ArrayRef Ptrs, const Value *Base, const PointersChainInfo &Info, + Type *AccessTy, TargetCostKind CostKind) override { - return Impl.getPointersChainCost(Ptrs, Base, Info, CostKind); + return Impl.getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); } unsigned getInliningThresholdMultiplier() override { return Impl.getInliningThresholdMultiplier(); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1041,6 +1041,7 @@ InstructionCost getPointersChainCost(ArrayRef Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, + Type *AccessTy, TTI::TargetCostKind CostKind) { InstructionCost Cost = TTI::TCC_Free; // In the basic model we take into account GEP instructions only diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -356,8 +356,19 @@ } void fabs() { - KnownFPClasses &= (fcPositive | fcNan); - SignBit = false; + if (KnownFPClasses & fcNegZero) + KnownFPClasses |= fcPosZero; + + if (KnownFPClasses & fcNegInf) + KnownFPClasses |= fcPosInf; + + if (KnownFPClasses & 
fcNegSubnormal) + KnownFPClasses |= fcPosSubnormal; + + if (KnownFPClasses & fcNegNormal) + KnownFPClasses |= fcPosNormal; + + signBitMustBeZero(); } /// Return true if the sign bit must be 0, ignoring the sign of nans. diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2107,12 +2107,12 @@ // Reinterpreting data // -def int_aarch64_sve_convert_from_svbool : DefaultAttrsIntrinsic<[llvm_anyvector_ty], +def int_aarch64_sve_convert_from_svbool : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_nxv16i1_ty], [IntrNoMem]>; def int_aarch64_sve_convert_to_svbool : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], - [llvm_anyvector_ty], + [llvm_any_ty], [IntrNoMem]>; // @@ -2600,6 +2600,46 @@ def int_aarch64_sve_bfdot_lane_v2 : SVE_4Vec_BF16_Indexed; def int_aarch64_sve_bfmlalb_lane_v2 : SVE_4Vec_BF16_Indexed; def int_aarch64_sve_bfmlalt_lane_v2 : SVE_4Vec_BF16_Indexed; + +// +// SVE2.1 - Contiguous loads to multiple consecutive vectors +// + + class SVE2p1_Load_PN_X2_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [llvm_aarch64_svcount_ty, llvm_ptr_ty], + [IntrReadMem, IntrArgMemOnly]>; + + class SVE2p1_Load_PN_X4_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_aarch64_svcount_ty, llvm_ptr_ty], + [IntrReadMem, IntrArgMemOnly]>; + +def int_aarch64_sve_ld1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic; +def int_aarch64_sve_ld1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic; +def int_aarch64_sve_ldnt1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic; +def int_aarch64_sve_ldnt1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic; + +// +// SVE2.1 - Contiguous stores to multiple consecutive vectors +// + + class SVE2p1_Store_PN_X2_Intrinsic + : DefaultAttrsIntrinsic<[], [ llvm_anyvector_ty, LLVMMatchType<0>, + llvm_aarch64_svcount_ty, llvm_ptr_ty ], + [IntrWriteMem, IntrArgMemOnly]>; + + class 
SVE2p1_Store_PN_X4_Intrinsic + : DefaultAttrsIntrinsic<[], [ llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + llvm_aarch64_svcount_ty, llvm_ptr_ty], + [IntrWriteMem, IntrArgMemOnly]>; + +def int_aarch64_sve_st1_pn_x2 : SVE2p1_Store_PN_X2_Intrinsic; +def int_aarch64_sve_st1_pn_x4 : SVE2p1_Store_PN_X4_Intrinsic; +def int_aarch64_sve_stnt1_pn_x2 : SVE2p1_Store_PN_X2_Intrinsic; +def int_aarch64_sve_stnt1_pn_x4 : SVE2p1_Store_PN_X4_Intrinsic; } // @@ -2752,9 +2792,9 @@ // def int_aarch64_sve_psel - : DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, - LLVMMatchType<0>, llvm_i32_ty], + : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], + [llvm_nxv16i1_ty, + llvm_anyvector_ty, llvm_i32_ty], [IntrNoMem]>; // @@ -2926,6 +2966,21 @@ LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class SVE2_VG2_Sel_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [llvm_aarch64_svcount_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>], [IntrNoMem]>; + + class SVE2_VG4_Sel_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_aarch64_svcount_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>], [IntrNoMem]>; + class SME2_CVT_VG2_SINGLE_Intrinsic : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>], [llvm_anyvector_ty, LLVMMatchType<0>], @@ -3385,4 +3440,9 @@ def int_aarch64_sve_uunpk_x2 : SME2_VG2_Unpk_Intrinsic; def int_aarch64_sve_sunpk_x4 : SME2_VG4_Unpk_Intrinsic; def int_aarch64_sve_uunpk_x4 : SME2_VG4_Unpk_Intrinsic; + + // 2-way and 4-way vector selects + def int_aarch64_sve_sel_x2 : SVE2_VG2_Sel_Intrinsic; + def int_aarch64_sve_sel_x4 : SVE2_VG4_Sel_Intrinsic; + } diff --git a/llvm/include/llvm/MCA/CustomBehaviour.h b/llvm/include/llvm/MCA/CustomBehaviour.h --- a/llvm/include/llvm/MCA/CustomBehaviour.h +++ 
b/llvm/include/llvm/MCA/CustomBehaviour.h @@ -133,7 +133,7 @@ StringRef getData() const { return Data; } }; -using SharedInstrument = std::shared_ptr; +using UniqueInstrument = std::unique_ptr; /// This class allows targets to optionally customize the logic that resolves /// scheduling class IDs. Targets can use information encoded in Instrument @@ -156,8 +156,8 @@ // Instrument.Desc equal to Type virtual bool supportsInstrumentType(StringRef Type) const { return false; } - /// Allocate an Instrument, and return a shared pointer to it. - virtual SharedInstrument createInstrument(StringRef Desc, StringRef Data); + /// Allocate an Instrument, and return a unique pointer to it. + virtual UniqueInstrument createInstrument(StringRef Desc, StringRef Data); /// Given an MCInst and a vector of Instrument, a target can /// return a SchedClassID. This can be used by a subtarget to return a @@ -165,9 +165,8 @@ /// BaseInstruction This can be useful when a BaseInstruction does not convey /// the correct scheduling information without additional data. By default, /// it returns the SchedClassID that belongs to MCI. 
- virtual unsigned - getSchedClassID(const MCInstrInfo &MCII, const MCInst &MCI, - const SmallVector &IVec) const; + virtual unsigned getSchedClassID(const MCInstrInfo &MCII, const MCInst &MCI, + const SmallVector &IVec) const; }; } // namespace mca diff --git a/llvm/include/llvm/MCA/InstrBuilder.h b/llvm/include/llvm/MCA/InstrBuilder.h --- a/llvm/include/llvm/MCA/InstrBuilder.h +++ b/llvm/include/llvm/MCA/InstrBuilder.h @@ -84,11 +84,10 @@ InstRecycleCallback InstRecycleCB; Expected - createInstrDescImpl(const MCInst &MCI, - const SmallVector &IVec); + createInstrDescImpl(const MCInst &MCI, const SmallVector &IVec); Expected getOrCreateInstrDesc(const MCInst &MCI, - const SmallVector &IVec); + const SmallVector &IVec); InstrBuilder(const InstrBuilder &) = delete; InstrBuilder &operator=(const InstrBuilder &) = delete; @@ -114,8 +113,7 @@ void setInstRecycleCallback(InstRecycleCallback CB) { InstRecycleCB = CB; } Expected> - createInstruction(const MCInst &MCI, - const SmallVector &IVec); + createInstruction(const MCInst &MCI, const SmallVector &IVec); }; } // namespace mca } // namespace llvm diff --git a/llvm/include/llvm/Pass.h b/llvm/include/llvm/Pass.h --- a/llvm/include/llvm/Pass.h +++ b/llvm/include/llvm/Pass.h @@ -28,6 +28,9 @@ #ifndef LLVM_PASS_H #define LLVM_PASS_H +#ifdef EXPENSIVE_CHECKS +#include +#endif #include namespace llvm { diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h --- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h @@ -48,9 +48,9 @@ #ifndef LLVM_TRANSFORMS_IPO_FUNCTIONSPECIALIZATION_H #define LLVM_TRANSFORMS_IPO_FUNCTIONSPECIALIZATION_H +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InlineCost.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Transforms/Scalar/SCCP.h" 
#include "llvm/Transforms/Utils/Cloning.h" @@ -126,6 +126,7 @@ FunctionAnalysisManager *FAM; /// Analyses used to help determine if a function should be specialized. + std::function GetBFI; std::function GetTLI; std::function GetTTI; std::function GetAC; @@ -137,11 +138,12 @@ public: FunctionSpecializer( SCCPSolver &Solver, Module &M, FunctionAnalysisManager *FAM, + std::function GetBFI, std::function GetTLI, std::function GetTTI, std::function GetAC) - : Solver(Solver), M(M), FAM(FAM), GetTLI(GetTLI), GetTTI(GetTTI), - GetAC(GetAC) {} + : Solver(Solver), M(M), FAM(FAM), GetBFI(GetBFI), GetTLI(GetTLI), + GetTTI(GetTTI), GetAC(GetAC) {} ~FunctionSpecializer(); @@ -193,7 +195,7 @@ Cost getSpecializationCost(Function *F); /// Compute a bonus for replacing argument \p A with constant \p C. - Cost getSpecializationBonus(Argument *A, Constant *C, const LoopInfo &LI); + Cost getSpecializationBonus(Argument *A, Constant *C); /// Determine if it is possible to specialise the function for constant values /// of the formal parameter \p A. diff --git a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h --- a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h +++ b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h @@ -44,7 +44,6 @@ std::unique_ptr PredInfo; DominatorTree *DT; PostDominatorTree *PDT; - LoopInfo *LI; }; /// Helper struct shared between Function Specialization and SCCP Solver. 
@@ -91,8 +90,6 @@ const PredicateBase *getPredicateInfoFor(Instruction *I); - const LoopInfo &getLoopInfo(Function &F); - DomTreeUpdater getDTU(Function &F); /// trackValueOfGlobalVariable - Clients can use this method to diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -135,10 +135,10 @@ #define DEBUG_TYPE "scalar-evolution" -STATISTIC(NumTripCountsComputed, - "Number of loops with predictable loop counts"); -STATISTIC(NumTripCountsNotComputed, - "Number of loops without predictable loop counts"); +STATISTIC(NumExitCountsComputed, + "Number of loop exits with predictable exit counts"); +STATISTIC(NumExitCountsNotComputed, + "Number of loop exits without predictable exit counts"); STATISTIC(NumBruteForceTripCountsComputed, "Number of loops with trip counts computed by force"); @@ -8450,23 +8450,6 @@ // must be cleared in this scope. BackedgeTakenInfo Result = computeBackedgeTakenCount(L); - // In product build, there are no usage of statistic. - (void)NumTripCountsComputed; - (void)NumTripCountsNotComputed; -#if LLVM_ENABLE_STATS || !defined(NDEBUG) - const SCEV *BEExact = Result.getExact(L, this); - if (BEExact != getCouldNotCompute()) { - assert(isLoopInvariant(BEExact, L) && - isLoopInvariant(Result.getConstantMax(this), L) && - "Computed backedge-taken count isn't loop invariant for loop!"); - ++NumTripCountsComputed; - } else if (Result.getConstantMax(this) == getCouldNotCompute() && - isa(L->getHeader()->begin())) { - // Only count loops that have phi nodes as not being computable. - ++NumTripCountsNotComputed; - } -#endif // LLVM_ENABLE_STATS || !defined(NDEBUG) - // Now that we know more about the trip count for this loop, forget any // existing SCEV values for PHI nodes in this loop since they are only // conservative estimates made without the benefit of trip count @@ -8852,7 +8835,9 @@ // 1. 
For each exit that can be computed, add an entry to ExitCounts. // CouldComputeBECount is true only if all exits can be computed. - if (EL.ExactNotTaken == getCouldNotCompute()) + if (EL.ExactNotTaken != getCouldNotCompute()) + ++NumExitCountsComputed; + else // We couldn't compute an exact value for this exit, so // we won't be able to compute an exact value for the loop. CouldComputeBECount = false; @@ -8860,9 +8845,11 @@ // Exact always implies symbolic, only check symbolic. if (EL.SymbolicMaxNotTaken != getCouldNotCompute()) ExitCounts.emplace_back(ExitBB, EL); - else + else { assert(EL.ExactNotTaken == getCouldNotCompute() && "Exact is known but symbolic isn't?"); + ++NumExitCountsNotComputed; + } // 2. Derive the loop's MaxBECount from each exit's max number of // non-exiting iterations. Partition the loop exits into two kinds: diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -230,10 +230,11 @@ InstructionCost TargetTransformInfo::getPointersChainCost( ArrayRef Ptrs, const Value *Base, - const TTI::PointersChainInfo &Info, TTI::TargetCostKind CostKind) const { + const TTI::PointersChainInfo &Info, Type *AccessTy, + TTI::TargetCostKind CostKind) const { assert((Base || !Info.isSameBase()) && "If pointers have same base address it has to be provided."); - return TTIImpl->getPointersChainCost(Ptrs, Base, Info, CostKind); + return TTIImpl->getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); } unsigned TargetTransformInfo::getEstimatedNumberOfCaseClusters( diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -3817,18 +3817,12 @@ switch (IID) { default: break; - // sqrt(-0.0) = -0.0, no other negative results are possible. 
case Intrinsic::sqrt: + case Intrinsic::experimental_constrained_sqrt: + // sqrt(-0.0) = -0.0, no other negative results are possible. + // FIXME: Account for denormal-fp-math=preserve-sign denormal inputs case Intrinsic::canonicalize: return CannotBeNegativeZero(Call->getArgOperand(0), TLI, Depth + 1); - case Intrinsic::experimental_constrained_sqrt: { - // NOTE: This rounding mode restriction may be too strict. - const auto *CI = cast(Call); - if (CI->getRoundingMode() == RoundingMode::NearestTiesToEven) - return CannotBeNegativeZero(Call->getArgOperand(0), TLI, Depth + 1); - else - return false; - } // fabs(x) != -0.0 case Intrinsic::fabs: return true; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp --- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp @@ -70,13 +70,16 @@ } void fixPLTEdge(Edge &E, Symbol &PLTStubs) { - assert(E.getKind() == R_RISCV_CALL_PLT && "Not a R_RISCV_CALL_PLT edge?"); + assert((E.getKind() == R_RISCV_CALL || E.getKind() == R_RISCV_CALL_PLT || + E.getKind() == CallRelaxable) && + "Not a PLT edge?"); E.setKind(R_RISCV_CALL); E.setTarget(PLTStubs); } bool isExternalBranchEdge(Edge &E) const { - return (E.getKind() == R_RISCV_CALL || E.getKind() == R_RISCV_CALL_PLT) && + return (E.getKind() == R_RISCV_CALL || E.getKind() == R_RISCV_CALL_PLT || + E.getKind() == CallRelaxable) && !E.getTarget().isDefined(); } diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -479,7 +479,9 @@ assert(isInt<16>(BranchImm)); - *TargetPtr &= 0xfff8001fU; + uint32_t RawInstr = *(support::little32_t *)TargetPtr; + *(support::little32_t *)TargetPtr = RawInstr & 0xfff8001fU; + // Immediate:15:2 goes in bits 18:5 of TBZ, TBNZ or32le(TargetPtr, (BranchImm & 
0x0000FFFC) << 3); break; diff --git a/llvm/lib/MCA/CustomBehaviour.cpp b/llvm/lib/MCA/CustomBehaviour.cpp --- a/llvm/lib/MCA/CustomBehaviour.cpp +++ b/llvm/lib/MCA/CustomBehaviour.cpp @@ -42,14 +42,14 @@ return std::vector>(); } -SharedInstrument InstrumentManager::createInstrument(llvm::StringRef Desc, +UniqueInstrument InstrumentManager::createInstrument(llvm::StringRef Desc, llvm::StringRef Data) { - return std::make_shared(Desc, Data); + return std::make_unique(Desc, Data); } unsigned InstrumentManager::getSchedClassID( const MCInstrInfo &MCII, const MCInst &MCI, - const llvm::SmallVector &IVec) const { + const llvm::SmallVector &IVec) const { return MCII.get(MCI.getOpcode()).getSchedClass(); } diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp --- a/llvm/lib/MCA/InstrBuilder.cpp +++ b/llvm/lib/MCA/InstrBuilder.cpp @@ -511,7 +511,7 @@ Expected InstrBuilder::createInstrDescImpl(const MCInst &MCI, - const SmallVector &IVec) { + const SmallVector &IVec) { assert(STI.getSchedModel().hasInstrSchedModel() && "Itineraries are not yet supported!"); @@ -601,7 +601,7 @@ Expected InstrBuilder::getOrCreateInstrDesc(const MCInst &MCI, - const SmallVector &IVec) { + const SmallVector &IVec) { // Cache lookup using SchedClassID from Instrumentation unsigned SchedClassID = IM.getSchedClassID(MCII, MCI, IVec); @@ -622,7 +622,7 @@ Expected> InstrBuilder::createInstruction(const MCInst &MCI, - const SmallVector &IVec) { + const SmallVector &IVec) { Expected DescOrErr = getOrCreateInstrDesc(MCI, IVec); if (!DescOrErr) return DescOrErr.takeError(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -370,8 +370,12 @@ void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale, unsigned Opc_rr, unsigned Opc_ri, bool IsIntr = false); + void SelectContiguousMultiVectorLoad(SDNode *N, 
unsigned NumVecs, + unsigned Scale, unsigned Opc_rr, + unsigned Opc_ri); void SelectDestructiveMultiIntrinsic(SDNode *N, unsigned NumVecs, - bool IsZmMulti, unsigned Opcode); + bool IsZmMulti, unsigned Opcode, + bool HasPred = false); void SelectPExtPair(SDNode *N, unsigned Opc); void SelectWhilePair(SDNode *N, unsigned Opc); void SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode); @@ -1709,11 +1713,13 @@ void AArch64DAGToDAGISel::SelectDestructiveMultiIntrinsic(SDNode *N, unsigned NumVecs, bool IsZmMulti, - unsigned Opcode) { + unsigned Opcode, + bool HasPred) { assert(Opcode != 0 && "Unexpected opcode"); SDLoc DL(N); EVT VT = N->getValueType(0); + unsigned FirstVecIdx = HasPred ? 2 : 1; auto GetMultiVecOperand = [=](unsigned StartIdx) { SmallVector Regs(N->op_begin() + StartIdx, @@ -1721,16 +1727,20 @@ return createZMulTuple(Regs); }; - SDValue Zdn = GetMultiVecOperand(1); + SDValue Zdn = GetMultiVecOperand(FirstVecIdx); SDValue Zm; if (IsZmMulti) - Zm = GetMultiVecOperand(NumVecs + 1); + Zm = GetMultiVecOperand(NumVecs + FirstVecIdx); else - Zm = N->getOperand(NumVecs + 1); - - SDNode *Intrinsic = CurDAG->getMachineNode(Opcode, DL, MVT::Untyped, Zdn, Zm); + Zm = N->getOperand(NumVecs + FirstVecIdx); + SDNode *Intrinsic; + if (HasPred) + Intrinsic = CurDAG->getMachineNode(Opcode, DL, MVT::Untyped, + N->getOperand(1), Zdn, Zm); + else + Intrinsic = CurDAG->getMachineNode(Opcode, DL, MVT::Untyped, Zdn, Zm); SDValue SuperReg = SDValue(Intrinsic, 0); for (unsigned i = 0; i < NumVecs; ++i) ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg( @@ -1772,6 +1782,39 @@ CurDAG->RemoveDeadNode(N); } +void AArch64DAGToDAGISel::SelectContiguousMultiVectorLoad(SDNode *N, + unsigned NumVecs, + unsigned Scale, + unsigned Opc_ri, + unsigned Opc_rr) { + assert(Scale < 4 && "Invalid scaling value."); + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); + + // Use simplest addressing mode for now - base + 0 offset + SDValue PNg = 
N->getOperand(2); + SDValue Base = N->getOperand(3); + SDValue Offset = CurDAG->getTargetConstant(0, DL, MVT::i64); + + SDValue Ops[] = {PNg, // Predicate-as-counter + Base, // Memory operand + Offset, Chain}; + + const EVT ResTys[] = {MVT::Untyped, MVT::Other}; + + SDNode *Load = CurDAG->getMachineNode(Opc_ri, DL, ResTys, Ops); + SDValue SuperReg = SDValue(Load, 0); + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + i, DL, VT, SuperReg)); + + // Copy chain + unsigned ChainIdx = NumVecs; + ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1)); + CurDAG->RemoveDeadNode(N); +} + void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs, unsigned Opcode) { if (N->getValueType(0) != MVT::nxv4f32) @@ -4648,6 +4691,74 @@ } break; } + case Intrinsic::aarch64_sve_ld1_pn_x2: { + if (VT == MVT::nxv16i8) { + SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LD1B_2Z_IMM, AArch64::LD1B_2Z); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LD1H_2Z_IMM, AArch64::LD1H_2Z); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LD1W_2Z_IMM, AArch64::LD1W_2Z); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LD1D_2Z_IMM, AArch64::LD1D_2Z); + return; + } + break; + } + case Intrinsic::aarch64_sve_ld1_pn_x4: { + if (VT == MVT::nxv16i8) { + SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LD1B_4Z_IMM, AArch64::LD1B_4Z); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LD1H_4Z_IMM, AArch64::LD1H_4Z); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LD1W_4Z_IMM, AArch64::LD1W_4Z); + return; + } else 
if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LD1D_4Z_IMM, AArch64::LD1D_4Z); + return; + } + break; + } + case Intrinsic::aarch64_sve_ldnt1_pn_x2: { + if (VT == MVT::nxv16i8) { + SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LDNT1B_2Z_IMM, AArch64::LDNT1B_2Z); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LDNT1H_2Z_IMM, AArch64::LDNT1H_2Z); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LDNT1W_2Z_IMM, AArch64::LDNT1W_2Z); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LDNT1D_2Z_IMM, AArch64::LDNT1D_2Z); + return; + } + break; + } + case Intrinsic::aarch64_sve_ldnt1_pn_x4: { + if (VT == MVT::nxv16i8) { + SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LDNT1B_4Z_IMM, AArch64::LDNT1B_4Z); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LDNT1H_4Z_IMM, AArch64::LDNT1H_4Z); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LDNT1W_4Z_IMM, AArch64::LDNT1W_4Z); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LDNT1D_4Z_IMM, AArch64::LDNT1D_4Z); + return; + } + break; + } case Intrinsic::aarch64_sve_ld3_sret: { if (VT == MVT::nxv16i8) { SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B, @@ -5330,6 +5441,20 @@ SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true, AArch64::UZP_VG4_4Z4Z_Q); return; + case Intrinsic::aarch64_sve_sel_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::SEL_VG2_2ZC2Z2Z_B, AArch64::SEL_VG2_2ZC2Z2Z_H, + AArch64::SEL_VG2_2ZC2Z2Z_S, 
AArch64::SEL_VG2_2ZC2Z2Z_D})) + SelectDestructiveMultiIntrinsic(Node, 2, true, Op, /*HasPred=*/true); + return; + case Intrinsic::aarch64_sve_sel_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::SEL_VG4_4ZC4Z4Z_B, AArch64::SEL_VG4_4ZC4Z4Z_H, + AArch64::SEL_VG4_4ZC4Z4Z_S, AArch64::SEL_VG4_4ZC4Z4Z_D})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op, /*HasPred=*/true); + return; case Intrinsic::aarch64_sve_frinta_x2: SelectFrintFromVT(Node, 2, AArch64::FRINTA_2Z2Z_S); return; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5042,8 +5042,12 @@ case Intrinsic::aarch64_sve_dupq_lane: return LowerDUPQLane(Op, DAG); case Intrinsic::aarch64_sve_convert_from_svbool: + if (Op.getValueType() == MVT::aarch64svcount) + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1)); return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG); case Intrinsic::aarch64_sve_convert_to_svbool: + if (Op.getOperand(1).getValueType() == MVT::aarch64svcount) + return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1)); return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG); case Intrinsic::aarch64_sve_fneg: return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -632,8 +632,8 @@ defm UQRSHRN_VG4_Z4ZI : sme2_sat_shift_vector_vg4<"uqrshrn", 0b101, int_aarch64_sve_uqrshrn_x4>; defm SQRSHRUN_VG4_Z4ZI : sme2_sat_shift_vector_vg4<"sqrshrun", 0b110, int_aarch64_sve_sqrshrun_x4>; -defm SEL_VG2_2ZP2Z2Z: sme2_sel_vector_vg2<"sel">; -defm SEL_VG4_4ZP4Z4Z: sme2_sel_vector_vg4<"sel">; +defm SEL_VG2_2ZC2Z2Z: 
sme2_sel_vector_vg2<"sel">; +defm SEL_VG4_4ZC4Z4Z: sme2_sel_vector_vg4<"sel">; def LD1B_VG2_M2ZPXX : sme2_ld_vector_vg2_multi_scalar_scalar<0b00, 0b0, ZZ_b_strided, GPR64shifted8, "ld1b">; def LD1B_VG4_M4ZPXX : sme2_ld_vector_vg4_multi_scalar_scalar<0b00, 0b0, ZZZZ_b_strided, GPR64shifted8, "ld1b">; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3837,6 +3837,59 @@ defm STNT1W_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1w", 0b10, 0b1, ZZZZ_s_mul_r>; defm STNT1D_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1d", 0b11, 0b1, ZZZZ_d_mul_r>; +multiclass store_pn_x2 { + def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1), + (aarch64svcount PPR:$PNg), GPR64:$base), + (RegImmInst (REG_SEQUENCE ZPR2Mul2, Ty:$vec0, zsub0, Ty:$vec1, zsub1), + PPR:$PNg, GPR64:$base, (i64 0))>; +} + +// Stores of 2 consecutive vectors +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; +defm : store_pn_x2; + +multiclass store_pn_x4 { + def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1), (Ty ZPR:$vec2), (Ty ZPR:$vec3), + (aarch64svcount PPR:$PNg), GPR64:$base), + (RegImmInst (REG_SEQUENCE ZPR4Mul4, Ty:$vec0, zsub0, Ty:$vec1, zsub1, + Ty:$vec2, zsub2, Ty:$vec3, zsub3), + PPR:$PNg, GPR64:$base, (i64 0))>; +} + +// Stores of 4 consecutive vectors +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; +defm : store_pn_x4; + defm WHILEGE_2PXX : 
sve2p1_int_while_rr_pair<"whilege", 0b000>; defm WHILEGT_2PXX : sve2p1_int_while_rr_pair<"whilegt", 0b001>; defm WHILELT_2PXX : sve2p1_int_while_rr_pair<"whilelt", 0b010>; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -737,6 +737,11 @@ if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) return BinOpCombine; + // Ignore converts to/from svcount_t. + if (isa(II.getArgOperand(0)->getType()) || + isa(II.getType())) + return std::nullopt; + SmallVector CandidatesForRemoval; Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -1309,30 +1309,30 @@ (!cast(NAME # _D) PNRAny:$Pd, PNRAny:$Pn, PPR64:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_1:$imm), 0>; - def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPR8:$Pm), MatrixIndexGPR32Op12_15:$idx)), (!cast(NAME # _B) $Pn, $Pm, $idx, 0)>; - def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv8i1 PPR16:$Pm), MatrixIndexGPR32Op12_15:$idx)), (!cast(NAME # _H) $Pn, $Pm, $idx, 0)>; - def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv4i1 PPR32:$Pm), MatrixIndexGPR32Op12_15:$idx)), (!cast(NAME # _S) $Pn, $Pm, $idx, 0)>; - def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv2i1 PPR64:$Pm), MatrixIndexGPR32Op12_15:$idx)), (!cast(NAME # _D) $Pn, $Pm, $idx, 0)>; let AddedComplexity = 1 in { - def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm), 
+ def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPR8:$Pm), (i32 (tileslice8 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm)))), (!cast(NAME # _B) $Pn, $Pm, $idx, $imm)>; - def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv8i1 PPR16:$Pm), (i32 (tileslice16 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_7:$imm)))), (!cast(NAME # _H) $Pn, $Pm, $idx, $imm)>; - def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv4i1 PPR32:$Pm), (i32 (tileslice32 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_3:$imm)))), (!cast(NAME # _S) $Pn, $Pm, $idx, $imm)>; - def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm), + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv2i1 PPR64:$Pm), (i32 (tileslice64 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_1:$imm)))), (!cast(NAME # _D) $Pn, $Pm, $idx, $imm)>; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -1556,6 +1556,73 @@ return BreakPhiNodesCache[&I] = true; } +/// Helper class for "break large PHIs" (visitPHINode). +/// +/// This represents a slice of a PHI's incoming value, which is made up of: +/// - The type of the slice (Ty) +/// - The index in the incoming value's vector where the slice starts (Idx) +/// - The number of elements in the slice (NumElts). +/// It also keeps track of the NewPHI node inserted for this particular slice. +/// +/// Slice examples: +/// <4 x i64> -> Split into four i64 slices. +/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1] +/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail. 
+/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1] +class VectorSlice { +public: + VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts) + : Ty(Ty), Idx(Idx), NumElts(NumElts) {} + + Type *Ty = nullptr; + unsigned Idx = 0; + unsigned NumElts = 0; + PHINode *NewPHI = nullptr; + + /// Slice \p Inc according to the information contained within this slice. + /// This is cached, so if called multiple times for the same \p BB & \p Inc + /// pair, it returns the same Sliced value as well. + /// + /// Note this *intentionally* does not return the same value for, say, + /// [%bb.0, %0] & [%bb.1, %0] as: + /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then + /// the value in bb.1 may not be reachable from bb.0 if it's its + /// predecessor.) + /// - We also want to make our extract instructions as local as possible so + /// the DAG has better chances of folding them out. Duplicating them like + /// that is beneficial in that regard. + /// + /// This is both a minor optimization to avoid creating duplicate + /// instructions, but also a requirement for correctness. It is not forbidden + /// for a PHI node to have the same [BB, Val] pair multiple times. If we + /// returned a new value each time, those previously identical pairs would all + /// have different incoming values (from the same block) and it'd cause a "PHI + /// node has multiple entries for the same basic block with different incoming + /// values!" verifier error. 
+ Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) { + Value *&Res = SlicedVals[{BB, Inc}]; + if (Res) + return Res; + + IRBuilder<> B(BB->getTerminator()); + if (Instruction *IncInst = dyn_cast(Inc)) + B.SetCurrentDebugLocation(IncInst->getDebugLoc()); + + if (NumElts > 1) { + SmallVector Mask; + for (unsigned K = Idx; K < (Idx + NumElts); ++K) + Mask.push_back(K); + Res = B.CreateShuffleVector(Inc, Mask, NewValName); + } else + Res = B.CreateExtractElement(Inc, Idx, NewValName); + + return Res; + } + +private: + SmallDenseMap, Value *> SlicedVals; +}; + bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) { // Break-up fixed-vector PHIs into smaller pieces. // Default threshold is 32, so it breaks up any vector that's >32 bits into @@ -1577,14 +1644,6 @@ if (!ForceScalarizeLargePHIs && !canBreakPHINode(I)) return false; - struct VectorSlice { - Type *Ty = nullptr; - unsigned Idx = 0; - unsigned NumElts = 0; - std::vector IncomingValues = {}; - PHINode *NewPHI = nullptr; - }; - std::vector Slices; Type *EltTy = FVT->getElementType(); @@ -1599,47 +1658,36 @@ Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize); for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End; Idx += SubVecSize) - Slices.push_back(VectorSlice{SubVecTy, Idx, SubVecSize}); + Slices.emplace_back(SubVecTy, Idx, SubVecSize); } // Scalarize all remaining elements. for (; Idx < NumElts; ++Idx) - Slices.push_back(VectorSlice{EltTy, Idx, 1}); + Slices.emplace_back(EltTy, Idx, 1); } if (Slices.size() == 1) return false; - // Break up this PHI's incoming values. 
- for (unsigned Idx = 0; Idx < I.getNumIncomingValues(); ++Idx) { - Value *Inc = I.getIncomingValue(Idx); - - IRBuilder<> B(I.getIncomingBlock(Idx)->getTerminator()); - if (Instruction *IncInst = dyn_cast(Inc)) - B.SetCurrentDebugLocation(IncInst->getDebugLoc()); - - unsigned NameSuffix = 0; - for (VectorSlice &S : Slices) { - const auto ValName = - "largephi.extractslice" + std::to_string(NameSuffix++); - if (S.NumElts > 1) { - SmallVector Mask; - for (unsigned K = S.Idx; K < (S.Idx + S.NumElts); ++K) - Mask.push_back(K); - S.IncomingValues.push_back(B.CreateShuffleVector(Inc, Mask, ValName)); - } else - S.IncomingValues.push_back(B.CreateExtractElement(Inc, S.Idx, ValName)); - } - } - - // Now create one PHI per vector piece. - IRBuilder<> B(I.getParent()->getFirstNonPHI()); + // Create one PHI per vector piece. The "VectorSlice" class takes care of + // creating the necessary instruction to extract the relevant slices of each + // incoming value. + IRBuilder<> B(I.getParent()); B.SetCurrentDebugLocation(I.getDebugLoc()); + unsigned IncNameSuffix = 0; for (VectorSlice &S : Slices) { + // We need to reset the build on each iteration, because getSlicedVal may + // have inserted something into I's BB. + B.SetInsertPoint(I.getParent()->getFirstNonPHI()); S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues()); - for (const auto &[Idx, BB] : enumerate(I.blocks())) - S.NewPHI->addIncoming(S.IncomingValues[Idx], BB); + + for (const auto &[Idx, BB] : enumerate(I.blocks())) { + S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx), + "largephi.extractslice" + + std::to_string(IncNameSuffix++)), + BB); + } } // And replace this PHI with a vector of all the previous PHI values. 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4678,7 +4678,10 @@ SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4f16 || VT == MVT::v4i16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || + VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || + VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -634,11 +634,11 @@ assert(MBB.getParent()->getRegInfo().isReserved(Tmp) && "VGPR used for an intermediate copy should have been reserved."); - // Only loop through if there are any free registers left, otherwise - // scavenger may report a fatal error without emergency spill slot - // or spill with the slot. - while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { - Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + // Only loop through if there are any free registers left. We don't want to + // spill. + while (RegNo--) { + Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0, + /* AllowSpill */ false); if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) break; Tmp = Tmp2; @@ -7919,9 +7919,10 @@ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); // If available, prefer to use vcc. - Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) - ? Register(RI.getVCC()) - : RS.scavengeRegister(RI.getBoolRC(), I, 0, false); + Register UnusedCarry = + !RS.isRegUsed(AMDGPU::VCC) + ? Register(RI.getVCC()) + : RS.scavengeRegister(RI.getBoolRC(), I, 0, /* AllowSpill */ false); // TODO: Users need to deal with this. 
if (!UnusedCarry.isValid()) diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1539,7 +1539,11 @@ assert(MI->getNumExplicitOperands() == 2); } - MI->setDesc(TII->get(AMDGPU::COPY)); + unsigned CopyOp = MI->getOperand(1).isReg() + ? AMDGPU::COPY + : TII->getMovOpcode(TRI->getRegClassForOperandReg( + *MRI, MI->getOperand(0))); + MI->setDesc(TII->get(CopyOp)); } } diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h --- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h +++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h @@ -47,13 +47,13 @@ bool supportsInstrumentType(StringRef Type) const override; /// Create a Instrument for RISC-V target - SharedInstrument createInstrument(StringRef Desc, StringRef Data) override; + UniqueInstrument createInstrument(StringRef Desc, StringRef Data) override; /// Using the Instrument, returns a SchedClassID to use instead of /// the SchedClassID that belongs to the MCI or the original SchedClassID. 
unsigned getSchedClassID(const MCInstrInfo &MCII, const MCInst &MCI, - const SmallVector &IVec) const override; + const SmallVector &IVec) const override; }; } // namespace mca diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp --- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp +++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp @@ -73,7 +73,7 @@ return Type == RISCVLMULInstrument::DESC_NAME; } -SharedInstrument +UniqueInstrument RISCVInstrumentManager::createInstrument(llvm::StringRef Desc, llvm::StringRef Data) { if (Desc != RISCVLMULInstrument::DESC_NAME) { @@ -86,19 +86,19 @@ << Data << '\n'); return nullptr; } - return std::make_shared(Data); + return std::make_unique(Data); } unsigned RISCVInstrumentManager::getSchedClassID( const MCInstrInfo &MCII, const MCInst &MCI, - const llvm::SmallVector &IVec) const { + const llvm::SmallVector &IVec) const { unsigned short Opcode = MCI.getOpcode(); unsigned SchedClassID = MCII.get(Opcode).getSchedClass(); for (const auto &I : IVec) { // Unknown Instrument kind if (I->getDesc() == RISCVLMULInstrument::DESC_NAME) { - uint8_t LMUL = static_cast(I.get())->getLMUL(); + uint8_t LMUL = static_cast(I)->getLMUL(); const RISCVVInversePseudosTable::PseudoInfo *RVV = RISCVVInversePseudosTable::getBaseInfo(Opcode, LMUL); // Not a RVV instr diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -342,9 +342,7 @@ defvar vbool64_t = nxv1i1; // There is no need to define register classes for fractional LMUL. -def LMULList { - list m = [1, 2, 4, 8]; -} +defvar LMULList = [1, 2, 4, 8]; //===----------------------------------------------------------------------===// // Utility classes for segment load/store. 
@@ -576,7 +574,7 @@ (add (sequence "V%u", 8, 31), (sequence "V%u", 0, 7)), 1>; -foreach m = LMULList.m in { +foreach m = LMULList in { foreach nf = NFList.L in { def "VRN" # nf # "M" # m # "NoV0": VReg<[untyped], (add !cast("VN" # nf # "M" # m # "NoV0")), diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -106,6 +106,12 @@ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind); + InstructionCost getPointersChainCost(ArrayRef Ptrs, + const Value *Base, + const TTI::PointersChainInfo &Info, + Type *AccessTy, + TTI::TargetCostKind CostKind); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1592,6 +1592,55 @@ } } +// TODO: Deduplicate from TargetTransformInfoImplCRTPBase. +InstructionCost RISCVTTIImpl::getPointersChainCost( + ArrayRef Ptrs, const Value *Base, + const TTI::PointersChainInfo &Info, Type *AccessTy, + TTI::TargetCostKind CostKind) { + InstructionCost Cost = TTI::TCC_Free; + // In the basic model we take into account GEP instructions only + // (although here can come alloca instruction, a value, constants and/or + // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a + // pointer). Typically, if Base is a not a GEP-instruction and all the + // pointers are relative to the same base address, all the rest are + // either GEP instructions, PHIs, bitcasts or constants. When we have same + // base, we just calculate cost of each non-Base GEP as an ADD operation if + // any their index is a non-const. 
+ // If no known dependecies between the pointers cost is calculated as a sum + // of costs of GEP instructions. + for (auto [I, V] : enumerate(Ptrs)) { + const auto *GEP = dyn_cast(V); + if (!GEP) + continue; + if (Info.isSameBase() && V != Base) { + if (GEP->hasAllConstantIndices()) + continue; + // If the chain is unit-stride and BaseReg + stride*i is a legal + // addressing mode, then presume the base GEP is sitting around in a + // register somewhere and check if we can fold the offset relative to + // it. + unsigned Stride = DL.getTypeStoreSize(AccessTy); + if (Info.isUnitStride() && + isLegalAddressingMode(AccessTy, + /* BaseGV */ nullptr, + /* BaseOffset */ Stride * I, + /* HasBaseReg */ true, + /* Scale */ 0, + GEP->getType()->getPointerAddressSpace())) + continue; + Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_AnyValue, TTI::OP_None}, + std::nullopt); + } else { + SmallVector Indices(GEP->indices()); + Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(), + Indices, CostKind); + } + } + return Cost; +} + void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -529,7 +529,7 @@ } if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow()) - setOperationAction(ISD::PREFETCH , MVT::Other, Legal); + setOperationAction(ISD::PREFETCH , MVT::Other, Custom); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); @@ -33984,6 +33984,18 @@ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); } +static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + unsigned IsData = cast(Op.getOperand(4))->getZExtValue(); + + // We don't support non-data prefetch without 
PREFETCHI. + // Just preserve the chain. + if (!IsData && !Subtarget.hasPREFETCHI()) + return Op.getOperand(0); + + return Op; +} + static StringRef getInstrStrFromOpNo(const SmallVectorImpl &AsmStrs, unsigned OpNo) { const APInt Operand(32, OpNo); @@ -34188,6 +34200,7 @@ case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG); case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG); + case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG); } } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -180,6 +180,7 @@ InstructionCost getPointersChainCost(ArrayRef Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, + Type *AccessTy, TTI::TargetCostKind CostKind); InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4943,9 +4943,11 @@ return Cost + LT.first; } -InstructionCost X86TTIImpl::getPointersChainCost( - ArrayRef Ptrs, const Value *Base, - const TTI::PointersChainInfo &Info, TTI::TargetCostKind CostKind) { +InstructionCost +X86TTIImpl::getPointersChainCost(ArrayRef Ptrs, + const Value *Base, + const TTI::PointersChainInfo &Info, + Type *AccessTy, TTI::TargetCostKind CostKind) { if (Info.isSameBase() && Info.isKnownStride()) { // If all the pointers have known stride all the differences are translated // into constants. 
X86 memory addressing allows encoding it into @@ -4957,7 +4959,7 @@ } return TTI::TCC_Free; } - return BaseT::getPointersChainCost(Ptrs, Base, Info, CostKind); + return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); } InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -49,7 +49,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InlineCost.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" @@ -82,10 +81,6 @@ "Don't specialize functions that have less than this number of " "instructions")); -static cl::opt AvgLoopIters( - "funcspec-avg-loop-iters", cl::init(10), cl::Hidden, cl::desc( - "Average loop iteration count")); - static cl::opt SpecializeOnAddress( "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc( "Enable function specialization on the address of global values")); @@ -502,8 +497,7 @@ // Calculate the specialisation gain. Cost Score = 0 - SpecCost; for (ArgInfo &A : S.Args) - Score += - getSpecializationBonus(A.Formal, A.Actual, Solver.getLoopInfo(*F)); + Score += getSpecializationBonus(A.Formal, A.Actual); // Discard unprofitable specialisations. if (!ForceSpecialization && Score <= 0) @@ -594,41 +588,42 @@ } static Cost getUserBonus(User *U, TargetTransformInfo &TTI, - const LoopInfo &LI) { + BlockFrequencyInfo &BFI) { auto *I = dyn_cast_or_null(U); // If not an instruction we do not know how to evaluate. // Keep minimum possible cost for now so that it doesnt affect // specialization. 
if (!I) - return std::numeric_limits::min(); + return 0; - Cost Bonus = - TTI.getInstructionCost(U, TargetTransformInfo::TCK_SizeAndLatency); + uint64_t Weight = BFI.getBlockFreq(I->getParent()).getFrequency() / + BFI.getEntryFreq(); + if (!Weight) + return 0; - // Increase the cost if it is inside the loop. - unsigned LoopDepth = LI.getLoopDepth(I->getParent()); - Bonus *= std::pow((double)AvgLoopIters, LoopDepth); + Cost Bonus = Weight * + TTI.getInstructionCost(U, TargetTransformInfo::TCK_SizeAndLatency); // Traverse recursively if there are more uses. // TODO: Any other instructions to be added here? if (I->mayReadFromMemory() || I->isCast()) for (auto *User : I->users()) - Bonus += getUserBonus(User, TTI, LI); + Bonus += getUserBonus(User, TTI, BFI); return Bonus; } /// Compute a bonus for replacing argument \p A with constant \p C. -Cost FunctionSpecializer::getSpecializationBonus(Argument *A, Constant *C, - const LoopInfo &LI) { +Cost FunctionSpecializer::getSpecializationBonus(Argument *A, Constant *C) { Function *F = A->getParent(); auto &TTI = (GetTTI)(*F); + auto &BFI = (GetBFI)(*F); LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: " << C->getNameOrAsOperand() << "\n"); Cost TotalCost = 0; for (auto *U : A->users()) { - TotalCost += getUserBonus(U, TTI, LI); + TotalCost += getUserBonus(U, TTI, BFI); LLVM_DEBUG(dbgs() << "FnSpecialization: User cost "; TotalCost.print(dbgs()); dbgs() << " for: " << *U << "\n"); } diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp --- a/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/llvm/lib/Transforms/IPO/SCCP.cpp @@ -13,7 +13,7 @@ #include "llvm/Transforms/IPO/SCCP.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -107,13 +107,15 @@ 
static bool runIPSCCP( Module &M, const DataLayout &DL, FunctionAnalysisManager *FAM, + std::function GetBFI, std::function GetTLI, std::function GetTTI, std::function GetAC, function_ref getAnalysis, bool IsFuncSpecEnabled) { SCCPSolver Solver(DL, GetTLI, M.getContext()); - FunctionSpecializer Specializer(Solver, M, FAM, GetTLI, GetTTI, GetAC); + FunctionSpecializer Specializer(Solver, M, FAM, GetBFI, GetTLI, GetTTI, + GetAC); // Loop over all functions, marking arguments to those with their addresses // taken or that are external as overdefined. @@ -381,21 +383,23 @@ auto GetTLI = [&FAM](Function &F) -> const TargetLibraryInfo & { return FAM.getResult(F); }; + auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & { + return FAM.getResult(F); + }; auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { return FAM.getResult(F); }; auto GetAC = [&FAM](Function &F) -> AssumptionCache & { return FAM.getResult(F); }; - auto getAnalysis = [&FAM, this](Function &F) -> AnalysisResultsForFn { + auto getAnalysis = [&FAM](Function &F) -> AnalysisResultsForFn { DominatorTree &DT = FAM.getResult(F); return { std::make_unique(F, DT, FAM.getResult(F)), - &DT, FAM.getCachedResult(F), - isFuncSpecEnabled() ? 
&FAM.getResult(F) : nullptr }; + &DT, FAM.getCachedResult(F) }; }; - if (!runIPSCCP(M, DL, &FAM, GetTLI, GetTTI, GetAC, getAnalysis, + if (!runIPSCCP(M, DL, &FAM, GetBFI, GetTLI, GetTTI, GetAC, getAnalysis, isFuncSpecEnabled())) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1611,6 +1611,17 @@ if (!OtherBr || BBI == OtherBB->begin()) return false; + auto OtherStoreIsMergeable = [&](StoreInst *OtherStore) -> bool { + if (!OtherStore || + OtherStore->getPointerOperand() != SI.getPointerOperand()) + return false; + + auto *SIVTy = SI.getValueOperand()->getType(); + auto *OSVTy = OtherStore->getValueOperand()->getType(); + return CastInst::isBitOrNoopPointerCastable(OSVTy, SIVTy, DL) && + SI.hasSameSpecialState(OtherStore); + }; + // If the other block ends in an unconditional branch, check for the 'if then // else' case. There is an instruction before the branch. StoreInst *OtherStore = nullptr; @@ -1626,8 +1637,7 @@ // If this isn't a store, isn't a store to the same location, or is not the // right kind of store, bail out. OtherStore = dyn_cast(BBI); - if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) || - !SI.isSameOperationAs(OtherStore)) + if (!OtherStoreIsMergeable(OtherStore)) return false; } else { // Otherwise, the other block ended with a conditional branch. If one of the @@ -1641,12 +1651,10 @@ // lives in OtherBB. for (;; --BBI) { // Check to see if we find the matching store. 
- if ((OtherStore = dyn_cast(BBI))) { - if (OtherStore->getOperand(1) != SI.getOperand(1) || - !SI.isSameOperationAs(OtherStore)) - return false; + OtherStore = dyn_cast(BBI); + if (OtherStoreIsMergeable(OtherStore)) break; - } + // If we find something that may be using or overwriting the stored // value, or if we run out of instructions, we can't do the transform. if (BBI->mayReadFromMemory() || BBI->mayThrow() || @@ -1664,14 +1672,17 @@ } // Insert a PHI node now if we need it. - Value *MergedVal = OtherStore->getOperand(0); + Value *MergedVal = OtherStore->getValueOperand(); // The debug locations of the original instructions might differ. Merge them. DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(), OtherStore->getDebugLoc()); - if (MergedVal != SI.getOperand(0)) { - PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge"); - PN->addIncoming(SI.getOperand(0), SI.getParent()); - PN->addIncoming(OtherStore->getOperand(0), OtherBB); + if (MergedVal != SI.getValueOperand()) { + PHINode *PN = + PHINode::Create(SI.getValueOperand()->getType(), 2, "storemerge"); + PN->addIncoming(SI.getValueOperand(), SI.getParent()); + Builder.SetInsertPoint(OtherStore); + PN->addIncoming(Builder.CreateBitOrPointerCast(MergedVal, PN->getType()), + OtherBB); MergedVal = InsertNewInstBefore(PN, DestBB->front()); PN->setDebugLoc(MergedLoc); } diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -106,6 +106,8 @@ "Number of min/max expressions hoisted out of the loop"); STATISTIC(NumGEPsHoisted, "Number of geps reassociated and hoisted out of the loop"); +STATISTIC(NumAddSubHoisted, "Number of add/subtract expressions reassociated " + "and hoisted out of the loop"); /// Memory promotion is enabled by default. static cl::opt @@ -2525,10 +2527,89 @@ return true; } +/// Try to turn things like "LV + C1 < C2" into "LV < C2 - C1". 
Here +/// C1 and C2 are loop invariants and LV is a loop-variant. +static bool hoistAdd(ICmpInst::Predicate Pred, Value *VariantLHS, + Value *InvariantRHS, ICmpInst &ICmp, Loop &L, + ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU, + AssumptionCache *AC, DominatorTree *DT) { + assert(ICmpInst::isSigned(Pred) && "Not supported yet!"); + assert(!L.isLoopInvariant(VariantLHS) && "Precondition."); + assert(L.isLoopInvariant(InvariantRHS) && "Precondition."); + + // Try to represent VariantLHS as sum of invariant and variant operands. + using namespace PatternMatch; + Value *VariantOp, *InvariantOp; + if (!match(VariantLHS, m_NSWAdd(m_Value(VariantOp), m_Value(InvariantOp)))) + return false; + + // LHS itself is a loop-variant, try to represent it in the form: + // "VariantOp + InvariantOp". If it is possible, then we can reassociate. + if (L.isLoopInvariant(VariantOp)) + std::swap(VariantOp, InvariantOp); + if (L.isLoopInvariant(VariantOp) || !L.isLoopInvariant(InvariantOp)) + return false; + + // In order to turn "LV + C1 < C2" into "LV < C2 - C1", we need to be able to + // freely move values from left side of inequality to right side (just as in + // normal linear arithmetics). Overflows make things much more complicated, so + // we want to avoid this. 
+ auto &DL = L.getHeader()->getModule()->getDataLayout(); + bool ProvedNoOverflowAfterReassociate = + computeOverflowForSignedSub(InvariantRHS, InvariantOp, DL, AC, &ICmp, + DT) == llvm::OverflowResult::NeverOverflows; + if (!ProvedNoOverflowAfterReassociate) + return false; + auto *Preheader = L.getLoopPreheader(); + assert(Preheader && "Loop is not in simplify form?"); + IRBuilder<> Builder(Preheader->getTerminator()); + Value *NewCmpOp = Builder.CreateSub(InvariantRHS, InvariantOp, "invariant.op", + /*HasNUW*/ false, /*HasNSW*/ true); + ICmp.setPredicate(Pred); + ICmp.setOperand(0, VariantOp); + ICmp.setOperand(1, NewCmpOp); + eraseInstruction(cast(*VariantLHS), SafetyInfo, MSSAU); + return true; +} + +/// Reassociate and hoist add/sub expressions. +static bool hoistAddSub(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, + MemorySSAUpdater &MSSAU, AssumptionCache *AC, + DominatorTree *DT) { + using namespace PatternMatch; + ICmpInst::Predicate Pred; + Value *LHS, *RHS; + if (!match(&I, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) + return false; + + // TODO: Support unsigned predicates? + if (!ICmpInst::isSigned(Pred)) + return false; + + // Put variant operand to LHS position. + if (L.isLoopInvariant(LHS)) { + std::swap(LHS, RHS); + Pred = ICmpInst::getSwappedPredicate(Pred); + } + // We want to delete the initial operation after reassociation, so only do it + // if it has no other uses. + if (L.isLoopInvariant(LHS) || !L.isLoopInvariant(RHS) || !LHS->hasOneUse()) + return false; + + // TODO: We could go with smarter context, taking common dominator of all I's + // users instead of I itself. + if (hoistAdd(Pred, LHS, RHS, cast(I), L, SafetyInfo, MSSAU, AC, DT)) + return true; + + // TODO: Support Sub. 
+ + return false; +} + static bool hoistArithmetics(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater &MSSAU, - AssumptionCache *AC, DominatorTree *DT) { + MemorySSAUpdater &MSSAU, AssumptionCache *AC, + DominatorTree *DT) { // Optimize complex patterns, such as (x < INV1 && x < INV2), turning them // into (x < min(INV1, INV2)), and hoisting the invariant part of this // expression out of the loop. @@ -2545,6 +2626,13 @@ return true; } + // Try to hoist add/sub's by reassociation. + if (hoistAddSub(I, L, SafetyInfo, MSSAU, AC, DT)) { + ++NumHoisted; + ++NumAddSubHoisted; + return true; + } + return false; } diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -664,13 +664,6 @@ return A->second.PredInfo->getPredicateInfoFor(I); } - const LoopInfo &getLoopInfo(Function &F) { - auto A = AnalysisResults.find(&F); - assert(A != AnalysisResults.end() && A->second.LI && - "Need LoopInfo analysis results for function."); - return *A->second.LI; - } - DomTreeUpdater getDTU(Function &F) { auto A = AnalysisResults.find(&F); assert(A != AnalysisResults.end() && "Need analysis results for function."); @@ -1962,10 +1955,6 @@ return Visitor->getPredicateInfoFor(I); } -const LoopInfo &SCCPSolver::getLoopInfo(Function &F) { - return Visitor->getLoopInfo(F); -} - DomTreeUpdater SCCPSolver::getDTU(Function &F) { return Visitor->getDTU(F); } void SCCPSolver::trackValueOfGlobalVariable(GlobalVariable *GV) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7394,7 +7394,8 @@ // stay in vectorized code due to uses outside of these scalar // loads/stores. 
ScalarCost = TTI->getPointersChainCost( - Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), CostKind); + Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy, + CostKind); SmallVector PtrsRetainedInVecCode; for (Value *V : Ptrs) { @@ -7420,7 +7421,7 @@ } VecCost = TTI->getPointersChainCost( PtrsRetainedInVecCode, BasePtr, - TTI::PointersChainInfo::getKnownStride(), CostKind); + TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind); } else { // Case 1: Ptrs are the arguments of loads that we are going to transform // into masked gather load intrinsic. @@ -7436,7 +7437,8 @@ ? TTI::PointersChainInfo::getUnknownStride() : TTI::PointersChainInfo::getKnownStride(); - ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, CostKind); + ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, + CostKind); // Remark: it not quite correct to use scalar GEP cost for a vector GEP, // but it's not clear how to do that without having vector GEP arguments diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -686,6 +686,11 @@ } PHINode *getPhi() const { return Phi; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the VPLiveOut to \p O. 
+ void print(raw_ostream &O, VPSlotTracker &SlotTracker) const; +#endif }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -791,11 +791,7 @@ if (!LiveOuts.empty()) O << "\n"; for (const auto &KV : LiveOuts) { - O << "Live-out "; - KV.second->getPhi()->printAsOperand(O); - O << " = "; - KV.second->getOperand(0)->printAsOperand(O, SlotTracker); - O << "\n"; + KV.second->print(O, SlotTracker); } O << "}\n"; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -169,6 +169,16 @@ State.Builder.GetInsertBlock()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { + O << "Live-out "; + getPhi()->printAsOperand(O); + O << " = "; + getOperand(0)->printAsOperand(O, SlotTracker); + O << "\n"; +} +#endif + void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { assert(!Parent && "Recipe already in some VPBasicBlock"); assert(InsertPos->getParent() && diff --git a/llvm/test/Analysis/ScalarEvolution/pr62380.ll b/llvm/test/Analysis/ScalarEvolution/pr62380.ll --- a/llvm/test/Analysis/ScalarEvolution/pr62380.ll +++ b/llvm/test/Analysis/ScalarEvolution/pr62380.ll @@ -1,12 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -passes='loop(loop-deletion),loop-mssa(loop-predication,licm,simple-loop-unswitch),loop(loop-predication)' -S < %s | FileCheck %s -; REQUIRES: asserts -; XFAIL: * - target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2" target triple = "x86_64-unknown-linux-gnu" define void @test(i32 
%arg) { +; CHECK-LABEL: define void @test +; CHECK-SAME: (i32 [[ARG:%.*]]) { +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br i1 false, label [[BB3_PREHEADER:%.*]], label [[BB1]] +; CHECK: bb3.preheader: +; CHECK-NEXT: [[LOAD_LE:%.*]] = load i32, ptr null, align 4 +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb3.loopexit: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[ADD:%.*]], [[BB3_LOOPEXIT:%.*]] ], [ 0, [[BB3_PREHEADER]] ] +; CHECK-NEXT: [[ADD]] = add i32 [[PHI]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i32 [[PHI]], [[LOAD_LE]] +; CHECK-NEXT: br i1 [[ICMP]], label [[BB5:%.*]], label [[BB4:%.*]] +; CHECK: bb4: +; CHECK-NEXT: ret void +; CHECK: bb5: +; CHECK-NEXT: [[CALL:%.*]] = call i1 @llvm.experimental.widenable.condition() +; CHECK-NEXT: br i1 [[CALL]], label [[BB9_PREHEADER:%.*]], label [[BB14:%.*]] +; CHECK: bb9.preheader: +; CHECK-NEXT: br label [[BB9:%.*]] +; CHECK: bb6: +; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[PHI10:%.*]], 1 +; CHECK-NEXT: [[ICMP8:%.*]] = icmp ugt i32 [[PHI10]], 1 +; CHECK-NEXT: br i1 [[ICMP8]], label [[BB3_LOOPEXIT]], label [[BB9]] +; CHECK: bb9: +; CHECK-NEXT: [[PHI10]] = phi i32 [ [[ADD7]], [[BB6:%.*]] ], [ [[PHI]], [[BB9_PREHEADER]] ] +; CHECK-NEXT: [[ICMP11:%.*]] = icmp ult i32 [[PHI10]], [[ARG]] +; CHECK-NEXT: [[CALL12:%.*]] = call i1 @llvm.experimental.widenable.condition() +; CHECK-NEXT: [[AND:%.*]] = and i1 [[ICMP11]], true +; CHECK-NEXT: br i1 [[AND]], label [[BB6]], label [[BB13:%.*]] +; CHECK: bb13: +; CHECK-NEXT: ret void +; CHECK: bb14: +; CHECK-NEXT: ret void +; bb: br label %bb1 diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll @@ -55,6 +55,14 @@ ret %res } +define 
@reinterpret_bool_from_svcount(target("aarch64.svcount") %pg) "target-features"="+sme2" { +; CHECK-LABEL: reinterpret_bool_from_svcount: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") %pg) + ret %out +} + ; ; Converting from svbool_t ; @@ -99,6 +107,15 @@ ret %out } +define target("aarch64.svcount") @reinterpret_bool_to_svcount( %pg) "target-features"="+sme2" { +; CHECK-LABEL: reinterpret_bool_to_svcount: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %out = call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( %pg) + ret target("aarch64.svcount") %out +} + + ; Reinterpreting a ptrue should not introduce an `and` instruction. define @reinterpret_ptrue() { ; CHECK-LABEL: reinterpret_ptrue: @@ -142,9 +159,11 @@ declare @llvm.aarch64.sve.convert.to.svbool.nxv4i1() declare @llvm.aarch64.sve.convert.to.svbool.nxv2i1() declare @llvm.aarch64.sve.convert.to.svbool.nxv1i1() +declare @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount")) declare @llvm.aarch64.sve.convert.from.svbool.nxv16i1() declare @llvm.aarch64.sve.convert.from.svbool.nxv8i1() declare @llvm.aarch64.sve.convert.from.svbool.nxv4i1() declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() declare @llvm.aarch64.sve.convert.from.svbool.nxv1i1() +declare target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt() diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll --- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll @@ -22,70 +22,70 @@ ret %res } -define @psel_h( %p1, %p2, i32 %idx) { +define @psel_h( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_h: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.h[w12, 0] ; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.psel.nxv8i1( %p1, %p2, i32 %idx) - ret %res + %res = 
call @llvm.aarch64.sve.psel.nxv8i1( %p1, %p2, i32 %idx) + ret %res } -define @psel_h_imm( %p1, %p2, i32 %idx) { +define @psel_h_imm( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_h_imm: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.h[w12, 7] ; CHECK-NEXT: ret %add = add i32 %idx, 7 - %res = call @llvm.aarch64.sve.psel.nxv8i1( %p1, %p2, i32 %add) - ret %res + %res = call @llvm.aarch64.sve.psel.nxv8i1( %p1, %p2, i32 %add) + ret %res } -define @psel_s( %p1, %p2, i32 %idx) { +define @psel_s( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_s: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.s[w12, 0] ; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.psel.nxv4i1( %p1, %p2, i32 %idx) - ret %res + %res = call @llvm.aarch64.sve.psel.nxv4i1( %p1, %p2, i32 %idx) + ret %res } -define @psel_s_imm( %p1, %p2, i32 %idx) { +define @psel_s_imm( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_s_imm: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.s[w12, 3] ; CHECK-NEXT: ret %add = add i32 %idx, 3 - %res = call @llvm.aarch64.sve.psel.nxv4i1( %p1, %p2, i32 %add) - ret %res + %res = call @llvm.aarch64.sve.psel.nxv4i1( %p1, %p2, i32 %add) + ret %res } -define @psel_d( %p1, %p2, i32 %idx) { +define @psel_d( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_d: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.d[w12, 0] ; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.psel.nxv2i1( %p1, %p2, i32 %idx) - ret %res + %res = call @llvm.aarch64.sve.psel.nxv2i1( %p1, %p2, i32 %idx) + ret %res } -define @psel_d_imm( %p1, %p2, i32 %idx) { +define @psel_d_imm( %p1, %p2, i32 %idx) { ; CHECK-LABEL: psel_d_imm: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: psel p0, p0, p1.d[w12, 1] ; CHECK-NEXT: ret %add = add i32 %idx, 1 - %res = call @llvm.aarch64.sve.psel.nxv2i1( %p1, %p2, i32 %add) - ret %res + %res = call @llvm.aarch64.sve.psel.nxv2i1( %p1, %p2, i32 %add) + ret %res } declare 
@llvm.aarch64.sve.psel.nxv16i1(, , i32) -declare @llvm.aarch64.sve.psel.nxv8i1(, , i32) -declare @llvm.aarch64.sve.psel.nxv4i1(, , i32) -declare @llvm.aarch64.sve.psel.nxv2i1(, , i32) +declare @llvm.aarch64.sve.psel.nxv8i1(, , i32) +declare @llvm.aarch64.sve.psel.nxv4i1(, , i32) +declare @llvm.aarch64.sve.psel.nxv2i1(, , i32) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll @@ -0,0 +1,648 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+bf16 < %s | FileCheck %s + +; == Normal Multi-Vector Consecutive Loads == + +define { , } @ld1_x2_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ld1_x2_i8_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ld1_x2_i8_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.b, z0.b, z2.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + +define { , , , } @ld1_x4_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ld1_x4_i16_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ld1_x4_i16_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.h, z0.h, z4.h +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , , , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + + +; == Non-temporal Multi-Vector Consecutive Loads == + +define { , } @ldnt1_x2_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1b { z0.b, z1.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ldnt1_x2_i32_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ldnt1_x2_i32_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z2.s, z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + +define { , , , } @ldnt1_x4_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ldnt1_x4_i64_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ldnt1_x4_i64_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z4.d - z7.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.d, z0.d, z4.d +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , , , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount"), ptr) + +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount"), ptr) +declare 
{ , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8bf16(target("aarch64.svcount"), ptr) + +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8bf16(target("aarch64.svcount"), ptr) + +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount"), ptr) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll @@ -0,0 +1,174 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; == 8 to 64-bit elements 
== + +define { , } @sel_x2_i8(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.b, z1.b }, pn8, { z6.b, z7.b }, { z4.b, z5.b } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_i16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_f16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_bf16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_i32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_f32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_i64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +define { , } @sel_x2_f64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zm1, %zm2) nounwind { +; CHECK-LABEL: sel_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) + ret { , } %res +} + +; == 8 to 64-bit elements == +declare { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") %pn, %zn1, 
%zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) +declare { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zm1, %zm2) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll @@ -0,0 +1,215 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; == 8 to 64-bit elements == + +define { , , , } @sel_x4_i8(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1b { z27.b }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.b - z3.b }, pn8, { z28.b - z31.b }, { z24.b - z27.b } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_i16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i16: +; CHECK: // 
%bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1h { z27.h }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_f16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1h { z27.h }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_bf16(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1h { z27.h }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_i32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1w { z27.s }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.s - z3.s }, pn8, { z28.s - z31.s }, { z24.s - z27.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_f32(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1w { z27.s }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.s - z3.s }, pn8, { z28.s - z31.s }, { z24.s - z27.s } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_i64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1d { z27.d }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.d - z3.d }, pn8, { z28.d - z31.d }, { z24.d - z27.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +define { , , , } @sel_x4_f64(target("aarch64.svcount") %pn, %unused, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) nounwind { +; CHECK-LABEL: sel_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z31.d, z4.d +; CHECK-NEXT: ld1d { z27.d }, p1/z, [x0] +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: sel { z0.d - z3.d }, pn8, { z28.d - z31.d }, { z24.d - z27.d } +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + + +; == 8 to 64-bit elements == +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) +declare { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") %pn, %zn1, %zn2, %zn3, %zn4, %zm1, %zm2, %zm3, %zm4) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll 
b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll @@ -0,0 +1,650 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+bf16 < %s | FileCheck %s + +; == Normal Multi-Vector Consecutive Stores == + +define void @st1_x2_i8( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1b { z2.b, z3.b }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_i16( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_i32( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1w { z2.s, z3.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_i64( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1d { z2.d, z3.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_f16( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_bf16( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv8bf16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_f32( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1w { z2.s, z3.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_f64( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1d { z2.d, z3.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_i8( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1b { z4.b - z7.b }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_i16( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1h { z4.h - z7.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_i32( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1w { z4.s - z7.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_i64( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1d { z4.d - z7.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_f16( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1h { z4.h - z7.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_bf16( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1h { z4.h - z7.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv8bf16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_f32( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1w { z4.s - z7.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_f64( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1d { z4.d - z7.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +; == Non-temporal Multi-Vector Consecutive Stores == + +define void @stnt1_x2_i8( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: stnt1b { z2.b, z3.b }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x2_i16( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x2_i32( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x2_i64( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x2_f16( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x2_bf16( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8bf16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x2_f32( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x2_f64( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x4_i8( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: stnt1b { z4.b - z7.b }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x4_i16( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: stnt1h { z4.h - z7.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x4_i32( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: stnt1w { z4.s - z7.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x4_i64( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: stnt1d { z4.d - z7.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x4_f16( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: stnt1h { z4.h - z7.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x4_bf16( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: stnt1h { z4.h - z7.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8bf16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x4_f32( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: stnt1w { z4.s - z7.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x4_f64( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: stnt1d { z4.d - z7.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +declare void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x2.nxv8bf16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8bf16(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(, , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(, , 
target("aarch64.svcount"), ptr) + + +declare void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x4.nxv8bf16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8bf16(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(, , , , target("aarch64.svcount"), ptr) +declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(, , , , target("aarch64.svcount"), ptr) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll @@ -12,19 +12,19 @@ ; CHECK: then: ; CHECK-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i64 3 ; CHECK-NEXT: 
[[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X]], i64 4 ; CHECK-NEXT: br label [[FINALLY:%.*]] ; CHECK: else: ; CHECK-NEXT: br label [[FINALLY]] ; CHECK: finally: ; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ 3.140000e+00, [[ELSE]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ 9.900000e+00, [[ELSE]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ 6.140000e+00, [[ELSE]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ poison, [[ELSE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ 9.900000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ 6.140000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ poison, [[ELSE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 ; CHECK-NEXT: 
[[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 @@ -57,19 +57,19 @@ ; CHECK: then: ; CHECK-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i64 3 ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X]], i64 4 ; CHECK-NEXT: br label [[FINALLY:%.*]] ; CHECK: else: ; CHECK-NEXT: br label [[FINALLY]] ; CHECK: finally: ; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], 
[[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 @@ -101,19 +101,19 @@ ; CHECK: then: ; CHECK-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i64 3 ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X]], i64 4 ; CHECK-NEXT: br label [[FINALLY:%.*]] ; CHECK: else: ; CHECK-NEXT: br label [[FINALLY]] ; CHECK: finally: ; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ poison, [[ELSE]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ poison, [[ELSE]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ poison, [[ELSE]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi double [ 
[[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ poison, [[ELSE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ poison, [[ELSE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 @@ -145,25 +145,25 @@ ; CHECK: then: ; CHECK-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i64 3 ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X]], i64 4 ; CHECK-NEXT: br label [[FINALLY:%.*]] ; CHECK: else: ; CHECK-NEXT: [[SHUFFLED:%.*]] = shufflevector <5 x double> [[IN]], <5 x double> 
poison, <5 x i32> -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE01:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE34:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE45:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 0 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 4 ; CHECK-NEXT: br label [[FINALLY]] ; CHECK: finally: -; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE01]], [[ELSE]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE12]], [[ELSE]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE23]], [[ELSE]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE34]], [[ELSE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE45]], [[ELSE]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ 
[[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 @@ -196,26 +196,26 @@ ; CHECK: then: ; CHECK-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i64 3 ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X]], i64 4 ; CHECK-NEXT: br label [[FINALLY:%.*]] ; CHECK: else: ; CHECK-NEXT: [[LOCAL_SHUFFLE_SRC:%.*]] = insertelement <5 x double> [[IN]], double 3.250000e+00, i64 2 ; CHECK-NEXT: [[SHUFFLED:%.*]] = shufflevector <5 x double> [[LOCAL_SHUFFLE_SRC]], <5 x double> [[IN]], <5 x i32> -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE01:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement 
<5 x double> [[SHUFFLED]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE34:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE45:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 0 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 4 ; CHECK-NEXT: br label [[FINALLY]] ; CHECK: finally: -; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE01]], [[ELSE]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE12]], [[ELSE]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE23]], [[ELSE]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE34]], [[ELSE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE45]], [[ELSE]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; CHECK-NEXT: 
[[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 @@ -249,26 +249,26 @@ ; CHECK: then: ; CHECK-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i64 3 ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X]], i64 4 ; CHECK-NEXT: br label [[FINALLY:%.*]] ; CHECK: else: ; CHECK-NEXT: [[LOCAL_SHUFFLE_SRC:%.*]] = insertelement <5 x double> [[IN]], double 3.250000e+00, i64 2 ; CHECK-NEXT: [[SHUFFLED:%.*]] = shufflevector <5 x double> [[IN]], <5 x double> [[LOCAL_SHUFFLE_SRC]], <5 x i32> -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE01:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE34:%.*]] = 
extractelement <5 x double> [[SHUFFLED]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE45:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 0 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <5 x double> [[SHUFFLED]], i64 4 ; CHECK-NEXT: br label [[FINALLY]] ; CHECK: finally: -; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE01]], [[ELSE]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE12]], [[ELSE]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE23]], [[ELSE]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE34]], [[ELSE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE45]], [[ELSE]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> 
poison, double [[TMP0]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 @@ -340,24 +340,24 @@ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x double> [[TMP1]], double [[Z:%.*]], i64 3 ; CHECK-NEXT: [[X_4:%.*]] = insertelement <5 x double> [[TMP2]], double [[X]], i64 4 ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X_4]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X_4]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X_4]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X_4]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X_4]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X_4]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X_4]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X_4]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X_4]], i64 4 ; CHECK-NEXT: br label [[FINALLY:%.*]] ; CHECK: else: -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE01:%.*]] = extractelement <5 x double> [[IN:%.*]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <5 x double> [[IN]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = extractelement <5 x double> [[IN]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE34:%.*]] = extractelement <5 x double> [[IN]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE45:%.*]] = extractelement <5 x double> [[IN]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[IN:%.*]], i64 0 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[IN]], i64 1 +; 
CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <5 x double> [[IN]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <5 x double> [[IN]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <5 x double> [[IN]], i64 4 ; CHECK-NEXT: br label [[FINALLY]] ; CHECK: finally: -; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE01]], [[ELSE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE12]], [[ELSE]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE23]], [[ELSE]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE34]], [[ELSE]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE45]], [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP3]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP4]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP5]], i64 2 @@ -394,24 +394,24 @@ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <5 x double> , 
double [[X:%.*]], i64 3 ; CHECK-NEXT: [[X_4:%.*]] = insertelement <5 x double> [[TMP0]], double [[X]], i64 4 ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X_4]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X_4]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X_4]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X_4]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X_4]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X_4]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X_4]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X_4]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X_4]], i64 4 ; CHECK-NEXT: br label [[FINALLY:%.*]] ; CHECK: else: -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE01:%.*]] = extractelement <5 x double> [[IN:%.*]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <5 x double> [[IN]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = extractelement <5 x double> [[IN]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE34:%.*]] = extractelement <5 x double> [[IN]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE45:%.*]] = extractelement <5 x double> [[IN]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[IN:%.*]], i64 0 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[IN]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <5 x double> [[IN]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <5 x double> [[IN]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <5 x double> [[IN]], i64 4 ; CHECK-NEXT: br label [[FINALLY]] ; CHECK: finally: -; CHECK-NEXT: [[TMP1:%.*]] = phi double [ 
[[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE01]], [[ELSE]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE12]], [[ELSE]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE23]], [[ELSE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE34]], [[ELSE]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE45]], [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP1]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP2]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP3]], i64 2 @@ -446,24 +446,24 @@ ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <5 x double> [[X_1]], <5 x double> , <5 x i32> ; CHECK-NEXT: [[X_4:%.*]] = insertelement <5 x double> [[TMP0]], double [[X]], i64 2 ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X_4]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X_4]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x 
double> [[X_4]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X_4]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X_4]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X_4]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X_4]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X_4]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X_4]], i64 4 ; CHECK-NEXT: br label [[FINALLY:%.*]] ; CHECK: else: -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE01:%.*]] = extractelement <5 x double> [[IN:%.*]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <5 x double> [[IN]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = extractelement <5 x double> [[IN]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE34:%.*]] = extractelement <5 x double> [[IN]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE45:%.*]] = extractelement <5 x double> [[IN]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[IN:%.*]], i64 0 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[IN]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <5 x double> [[IN]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <5 x double> [[IN]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <5 x double> [[IN]], i64 4 ; CHECK-NEXT: br label [[FINALLY]] ; CHECK: finally: -; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE01]], [[ELSE]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE12]], [[ELSE]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE23]], [[ELSE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi 
double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE34]], [[ELSE]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE45]], [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP1]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP2]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP3]], i64 2 @@ -571,19 +571,19 @@ ; CHECK: then: ; CHECK-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i64 3 ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement 
<5 x double> [[X]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X]], i64 4 ; CHECK-NEXT: br label [[FINALLY:%.*]] ; CHECK: else: ; CHECK-NEXT: br label [[FINALLY]] ; CHECK: finally: ; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 @@ -593,17 +593,17 @@ ; CHECK-NEXT: br i1 [[COND2:%.*]], label [[THEN1:%.*]], label [[END:%.*]] ; CHECK: then1: ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE01:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 0 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 1 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 
2 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE34:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 3 -; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE45:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 4 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE22:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE43:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE64:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE85:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 4 ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE23]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE34]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE45]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE06:%.*]] = insertelement <5 x double> poison, double [[TMP5]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE17:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE06]], double [[TMP6]], i64 1 ; CHECK-NEXT: 
[[LARGEPHI_INSERTSLICE28:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE17]], double [[TMP7]], i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare -amdgpu-codegenprepare-force-break-large-phis %s | FileCheck %s --check-prefixes=OPT +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare -amdgpu-codegenprepare-force-break-large-phis -verify %s | FileCheck %s --check-prefixes=OPT ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare --global-isel %s | FileCheck %s --check-prefixes=NOOPT ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare -amdgpu-codegenprepare-break-large-phis=0 %s | FileCheck %s --check-prefixes=NOOPT @@ -9,32 +9,32 @@ ; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] ; OPT: then: ; OPT-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i32 3 -; OPT-NEXT: [[TMP0:%.*]] = extractelement <5 x double> [[X]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = extractelement <5 x double> [[X]], i64 1 -; OPT-NEXT: [[TMP2:%.*]] = extractelement <5 x double> [[X]], i64 2 -; OPT-NEXT: [[TMP3:%.*]] = extractelement <5 x double> [[X]], i64 3 -; OPT-NEXT: [[TMP4:%.*]] = extractelement <5 x double> [[X]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X]], 
i64 4 ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <5 x double> [[IN]], double 9.140000e+00, i32 2 -; OPT-NEXT: [[TMP5:%.*]] = extractelement <5 x double> [[Y]], i64 0 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <5 x double> [[Y]], i64 1 -; OPT-NEXT: [[TMP7:%.*]] = extractelement <5 x double> [[Y]], i64 2 -; OPT-NEXT: [[TMP8:%.*]] = extractelement <5 x double> [[Y]], i64 3 -; OPT-NEXT: [[TMP9:%.*]] = extractelement <5 x double> [[Y]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[Y]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[Y]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <5 x double> [[Y]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <5 x double> [[Y]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <5 x double> [[Y]], i64 4 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP10:%.*]] = phi double [ [[TMP0]], [[THEN]] ], [ [[TMP5]], [[ELSE]] ] -; OPT-NEXT: [[TMP11:%.*]] = phi double [ [[TMP1]], [[THEN]] ], [ [[TMP6]], [[ELSE]] ] -; OPT-NEXT: [[TMP12:%.*]] = phi double [ [[TMP2]], [[THEN]] ], [ [[TMP7]], [[ELSE]] ] -; OPT-NEXT: [[TMP13:%.*]] = phi double [ [[TMP3]], [[THEN]] ], [ [[TMP8]], [[ELSE]] ] -; OPT-NEXT: [[TMP14:%.*]] = phi double [ [[TMP4]], [[THEN]] ], [ [[TMP9]], [[ELSE]] ] -; OPT-NEXT: [[TMP15:%.*]] = insertelement <5 x double> poison, double [[TMP10]], i64 0 -; OPT-NEXT: [[TMP16:%.*]] = insertelement <5 x double> [[TMP15]], double [[TMP11]], i64 1 -; OPT-NEXT: [[TMP17:%.*]] = insertelement <5 x double> [[TMP16]], double [[TMP12]], i64 2 -; OPT-NEXT: [[TMP18:%.*]] = insertelement <5 x double> [[TMP17]], double [[TMP13]], i64 3 -; OPT-NEXT: [[TMP19:%.*]] = insertelement <5 x double> [[TMP18]], double [[TMP14]], i64 4 -; OPT-NEXT: store <5 x double> [[TMP19]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], 
[[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE2]], double [[TMP3]], i64 3 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE3]], double [[TMP4]], i64 4 +; OPT-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE4]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v5f64( @@ -71,40 +71,40 @@ ; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] ; OPT: then: ; OPT-NEXT: [[X:%.*]] = insertelement <7 x double> [[IN:%.*]], double 3.140000e+00, i32 3 -; OPT-NEXT: [[TMP0:%.*]] = extractelement <7 x double> [[X]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = extractelement <7 x double> [[X]], i64 1 -; OPT-NEXT: [[TMP2:%.*]] = extractelement <7 x double> [[X]], i64 2 -; OPT-NEXT: [[TMP3:%.*]] = extractelement <7 x double> [[X]], i64 3 -; OPT-NEXT: [[TMP4:%.*]] = extractelement <7 x double> [[X]], i64 4 -; OPT-NEXT: [[TMP5:%.*]] = extractelement <7 x double> [[X]], i64 5 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <7 x double> [[X]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = 
extractelement <7 x double> [[X]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <7 x double> [[X]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <7 x double> [[X]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <7 x double> [[X]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <7 x double> [[X]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <7 x double> [[X]], i64 5 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <7 x double> [[X]], i64 6 ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <7 x double> [[IN]], double 9.140000e+00, i32 6 -; OPT-NEXT: [[TMP7:%.*]] = extractelement <7 x double> [[Y]], i64 0 -; OPT-NEXT: [[TMP8:%.*]] = extractelement <7 x double> [[Y]], i64 1 -; OPT-NEXT: [[TMP9:%.*]] = extractelement <7 x double> [[Y]], i64 2 -; OPT-NEXT: [[TMP10:%.*]] = extractelement <7 x double> [[Y]], i64 3 -; OPT-NEXT: [[TMP11:%.*]] = extractelement <7 x double> [[Y]], i64 4 -; OPT-NEXT: [[TMP12:%.*]] = extractelement <7 x double> [[Y]], i64 5 -; OPT-NEXT: [[TMP13:%.*]] = extractelement <7 x double> [[Y]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <7 x double> [[Y]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <7 x double> [[Y]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <7 x double> [[Y]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <7 x double> [[Y]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <7 x double> [[Y]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <7 x double> [[Y]], i64 5 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <7 x double> [[Y]], i64 6 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP14:%.*]] = phi double [ [[TMP0]], [[THEN]] ], [ [[TMP7]], [[ELSE]] ] -; OPT-NEXT: [[TMP15:%.*]] = phi double [ [[TMP1]], [[THEN]] ], 
[ [[TMP8]], [[ELSE]] ] -; OPT-NEXT: [[TMP16:%.*]] = phi double [ [[TMP2]], [[THEN]] ], [ [[TMP9]], [[ELSE]] ] -; OPT-NEXT: [[TMP17:%.*]] = phi double [ [[TMP3]], [[THEN]] ], [ [[TMP10]], [[ELSE]] ] -; OPT-NEXT: [[TMP18:%.*]] = phi double [ [[TMP4]], [[THEN]] ], [ [[TMP11]], [[ELSE]] ] -; OPT-NEXT: [[TMP19:%.*]] = phi double [ [[TMP5]], [[THEN]] ], [ [[TMP12]], [[ELSE]] ] -; OPT-NEXT: [[TMP20:%.*]] = phi double [ [[TMP6]], [[THEN]] ], [ [[TMP13]], [[ELSE]] ] -; OPT-NEXT: [[TMP21:%.*]] = insertelement <7 x double> poison, double [[TMP14]], i64 0 -; OPT-NEXT: [[TMP22:%.*]] = insertelement <7 x double> [[TMP21]], double [[TMP15]], i64 1 -; OPT-NEXT: [[TMP23:%.*]] = insertelement <7 x double> [[TMP22]], double [[TMP16]], i64 2 -; OPT-NEXT: [[TMP24:%.*]] = insertelement <7 x double> [[TMP23]], double [[TMP17]], i64 3 -; OPT-NEXT: [[TMP25:%.*]] = insertelement <7 x double> [[TMP24]], double [[TMP18]], i64 4 -; OPT-NEXT: [[TMP26:%.*]] = insertelement <7 x double> [[TMP25]], double [[TMP19]], i64 5 -; OPT-NEXT: [[TMP27:%.*]] = insertelement <7 x double> [[TMP26]], double [[TMP20]], i64 6 -; OPT-NEXT: store <7 x double> [[TMP27]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ] +; OPT-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ 
[[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <7 x double> poison, double [[TMP0]], i64 0 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <7 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <7 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <7 x double> [[LARGEPHI_INSERTSLICE2]], double [[TMP3]], i64 3 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <7 x double> [[LARGEPHI_INSERTSLICE3]], double [[TMP4]], i64 4 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <7 x double> [[LARGEPHI_INSERTSLICE4]], double [[TMP5]], i64 5 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <7 x double> [[LARGEPHI_INSERTSLICE5]], double [[TMP6]], i64 6 +; OPT-NEXT: store <7 x double> [[LARGEPHI_INSERTSLICE6]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v7f64( @@ -141,56 +141,56 @@ ; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] ; OPT: then: ; OPT-NEXT: [[X:%.*]] = insertelement <11 x double> [[IN:%.*]], double 3.140000e+00, i32 3 -; OPT-NEXT: [[TMP0:%.*]] = extractelement <11 x double> [[X]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = extractelement <11 x double> [[X]], i64 1 -; OPT-NEXT: [[TMP2:%.*]] = extractelement <11 x double> [[X]], i64 2 -; OPT-NEXT: [[TMP3:%.*]] = extractelement <11 x double> [[X]], i64 3 -; OPT-NEXT: [[TMP4:%.*]] = extractelement <11 x double> [[X]], i64 4 -; OPT-NEXT: [[TMP5:%.*]] = extractelement <11 x double> [[X]], i64 5 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <11 x double> [[X]], i64 6 -; OPT-NEXT: [[TMP7:%.*]] = extractelement <11 x double> [[X]], i64 7 -; OPT-NEXT: [[TMP8:%.*]] = extractelement <11 x double> [[X]], i64 8 -; OPT-NEXT: [[TMP9:%.*]] = extractelement <11 x double> [[X]], i64 9 -; OPT-NEXT: [[TMP10:%.*]] = extractelement <11 x double> [[X]], i64 10 +; OPT-NEXT: 
[[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <11 x double> [[X]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <11 x double> [[X]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <11 x double> [[X]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <11 x double> [[X]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <11 x double> [[X]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <11 x double> [[X]], i64 5 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <11 x double> [[X]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE14:%.*]] = extractelement <11 x double> [[X]], i64 7 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE16:%.*]] = extractelement <11 x double> [[X]], i64 8 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE18:%.*]] = extractelement <11 x double> [[X]], i64 9 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE20:%.*]] = extractelement <11 x double> [[X]], i64 10 ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <11 x double> [[IN]], double 9.140000e+00, i32 6 -; OPT-NEXT: [[TMP11:%.*]] = extractelement <11 x double> [[Y]], i64 0 -; OPT-NEXT: [[TMP12:%.*]] = extractelement <11 x double> [[Y]], i64 1 -; OPT-NEXT: [[TMP13:%.*]] = extractelement <11 x double> [[Y]], i64 2 -; OPT-NEXT: [[TMP14:%.*]] = extractelement <11 x double> [[Y]], i64 3 -; OPT-NEXT: [[TMP15:%.*]] = extractelement <11 x double> [[Y]], i64 4 -; OPT-NEXT: [[TMP16:%.*]] = extractelement <11 x double> [[Y]], i64 5 -; OPT-NEXT: [[TMP17:%.*]] = extractelement <11 x double> [[Y]], i64 6 -; OPT-NEXT: [[TMP18:%.*]] = extractelement <11 x double> [[Y]], i64 7 -; OPT-NEXT: [[TMP19:%.*]] = extractelement <11 x double> [[Y]], i64 8 -; OPT-NEXT: [[TMP20:%.*]] = extractelement <11 x double> [[Y]], i64 9 -; OPT-NEXT: [[TMP21:%.*]] = extractelement <11 x double> [[Y]], i64 10 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <11 x double> [[Y]], i64 0 +; OPT-NEXT: 
[[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <11 x double> [[Y]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <11 x double> [[Y]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <11 x double> [[Y]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <11 x double> [[Y]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <11 x double> [[Y]], i64 5 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <11 x double> [[Y]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <11 x double> [[Y]], i64 7 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE17:%.*]] = extractelement <11 x double> [[Y]], i64 8 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE19:%.*]] = extractelement <11 x double> [[Y]], i64 9 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE21:%.*]] = extractelement <11 x double> [[Y]], i64 10 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP22:%.*]] = phi double [ [[TMP0]], [[THEN]] ], [ [[TMP11]], [[ELSE]] ] -; OPT-NEXT: [[TMP23:%.*]] = phi double [ [[TMP1]], [[THEN]] ], [ [[TMP12]], [[ELSE]] ] -; OPT-NEXT: [[TMP24:%.*]] = phi double [ [[TMP2]], [[THEN]] ], [ [[TMP13]], [[ELSE]] ] -; OPT-NEXT: [[TMP25:%.*]] = phi double [ [[TMP3]], [[THEN]] ], [ [[TMP14]], [[ELSE]] ] -; OPT-NEXT: [[TMP26:%.*]] = phi double [ [[TMP4]], [[THEN]] ], [ [[TMP15]], [[ELSE]] ] -; OPT-NEXT: [[TMP27:%.*]] = phi double [ [[TMP5]], [[THEN]] ], [ [[TMP16]], [[ELSE]] ] -; OPT-NEXT: [[TMP28:%.*]] = phi double [ [[TMP6]], [[THEN]] ], [ [[TMP17]], [[ELSE]] ] -; OPT-NEXT: [[TMP29:%.*]] = phi double [ [[TMP7]], [[THEN]] ], [ [[TMP18]], [[ELSE]] ] -; OPT-NEXT: [[TMP30:%.*]] = phi double [ [[TMP8]], [[THEN]] ], [ [[TMP19]], [[ELSE]] ] -; OPT-NEXT: [[TMP31:%.*]] = phi double [ [[TMP9]], [[THEN]] ], [ [[TMP20]], [[ELSE]] ] -; OPT-NEXT: [[TMP32:%.*]] = phi double [ [[TMP10]], [[THEN]] ], [ [[TMP21]], [[ELSE]] ] -; OPT-NEXT: [[TMP33:%.*]] = insertelement <11 x double> poison, double [[TMP22]], i64 0 -; OPT-NEXT: 
[[TMP34:%.*]] = insertelement <11 x double> [[TMP33]], double [[TMP23]], i64 1 -; OPT-NEXT: [[TMP35:%.*]] = insertelement <11 x double> [[TMP34]], double [[TMP24]], i64 2 -; OPT-NEXT: [[TMP36:%.*]] = insertelement <11 x double> [[TMP35]], double [[TMP25]], i64 3 -; OPT-NEXT: [[TMP37:%.*]] = insertelement <11 x double> [[TMP36]], double [[TMP26]], i64 4 -; OPT-NEXT: [[TMP38:%.*]] = insertelement <11 x double> [[TMP37]], double [[TMP27]], i64 5 -; OPT-NEXT: [[TMP39:%.*]] = insertelement <11 x double> [[TMP38]], double [[TMP28]], i64 6 -; OPT-NEXT: [[TMP40:%.*]] = insertelement <11 x double> [[TMP39]], double [[TMP29]], i64 7 -; OPT-NEXT: [[TMP41:%.*]] = insertelement <11 x double> [[TMP40]], double [[TMP30]], i64 8 -; OPT-NEXT: [[TMP42:%.*]] = insertelement <11 x double> [[TMP41]], double [[TMP31]], i64 9 -; OPT-NEXT: [[TMP43:%.*]] = insertelement <11 x double> [[TMP42]], double [[TMP32]], i64 10 -; OPT-NEXT: store <11 x double> [[TMP43]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ] +; OPT-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ] +; OPT-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ] +; OPT-NEXT: [[TMP8:%.*]] = phi double [ 
[[LARGEPHI_EXTRACTSLICE16]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE17]], [[ELSE]] ] +; OPT-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE18]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE19]], [[ELSE]] ] +; OPT-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE20]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE21]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <11 x double> poison, double [[TMP0]], i64 0 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE2]], double [[TMP3]], i64 3 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE3]], double [[TMP4]], i64 4 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE4]], double [[TMP5]], i64 5 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE5]], double [[TMP6]], i64 6 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE6]], double [[TMP7]], i64 7 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE8:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE7]], double [[TMP8]], i64 8 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE9:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE8]], double [[TMP9]], i64 9 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE10:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE9]], double [[TMP10]], i64 10 +; OPT-NEXT: store <11 x double> [[LARGEPHI_INSERTSLICE10]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v11f64( @@ -227,42 +227,42 @@ ; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[FINALLY:%.*]] ; OPT: then: ; OPT-NEXT: [[X:%.*]] = insertelement <11 x double> [[IN:%.*]], double 3.140000e+00, 
i32 3 -; OPT-NEXT: [[TMP0:%.*]] = extractelement <11 x double> [[X]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = extractelement <11 x double> [[X]], i64 1 -; OPT-NEXT: [[TMP2:%.*]] = extractelement <11 x double> [[X]], i64 2 -; OPT-NEXT: [[TMP3:%.*]] = extractelement <11 x double> [[X]], i64 3 -; OPT-NEXT: [[TMP4:%.*]] = extractelement <11 x double> [[X]], i64 4 -; OPT-NEXT: [[TMP5:%.*]] = extractelement <11 x double> [[X]], i64 5 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <11 x double> [[X]], i64 6 -; OPT-NEXT: [[TMP7:%.*]] = extractelement <11 x double> [[X]], i64 7 -; OPT-NEXT: [[TMP8:%.*]] = extractelement <11 x double> [[X]], i64 8 -; OPT-NEXT: [[TMP9:%.*]] = extractelement <11 x double> [[X]], i64 9 -; OPT-NEXT: [[TMP10:%.*]] = extractelement <11 x double> [[X]], i64 10 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <11 x double> [[X]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <11 x double> [[X]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <11 x double> [[X]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <11 x double> [[X]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <11 x double> [[X]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <11 x double> [[X]], i64 5 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <11 x double> [[X]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE14:%.*]] = extractelement <11 x double> [[X]], i64 7 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE16:%.*]] = extractelement <11 x double> [[X]], i64 8 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE18:%.*]] = extractelement <11 x double> [[X]], i64 9 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE20:%.*]] = extractelement <11 x double> [[X]], i64 10 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP11:%.*]] = phi double [ [[TMP0]], [[THEN]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; OPT-NEXT: [[TMP12:%.*]] = phi double [ [[TMP1]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] -; 
OPT-NEXT: [[TMP13:%.*]] = phi double [ [[TMP2]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] -; OPT-NEXT: [[TMP14:%.*]] = phi double [ [[TMP3]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] -; OPT-NEXT: [[TMP15:%.*]] = phi double [ [[TMP4]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] -; OPT-NEXT: [[TMP16:%.*]] = phi double [ [[TMP5]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] -; OPT-NEXT: [[TMP17:%.*]] = phi double [ [[TMP6]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] -; OPT-NEXT: [[TMP18:%.*]] = phi double [ [[TMP7]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] -; OPT-NEXT: [[TMP19:%.*]] = phi double [ [[TMP8]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] -; OPT-NEXT: [[TMP20:%.*]] = phi double [ [[TMP9]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] -; OPT-NEXT: [[TMP21:%.*]] = phi double [ [[TMP10]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] -; OPT-NEXT: [[TMP22:%.*]] = insertelement <11 x double> poison, double [[TMP11]], i64 0 -; OPT-NEXT: [[TMP23:%.*]] = insertelement <11 x double> [[TMP22]], double [[TMP12]], i64 1 -; OPT-NEXT: [[TMP24:%.*]] = insertelement <11 x double> [[TMP23]], double [[TMP13]], i64 2 -; OPT-NEXT: [[TMP25:%.*]] = insertelement <11 x double> [[TMP24]], double [[TMP14]], i64 3 -; OPT-NEXT: [[TMP26:%.*]] = insertelement <11 x double> [[TMP25]], double [[TMP15]], i64 4 -; OPT-NEXT: [[TMP27:%.*]] = insertelement <11 x double> [[TMP26]], double [[TMP16]], i64 5 -; OPT-NEXT: [[TMP28:%.*]] = insertelement <11 x double> [[TMP27]], double [[TMP17]], i64 6 -; OPT-NEXT: [[TMP29:%.*]] = insertelement <11 x double> [[TMP28]], double [[TMP18]], i64 7 -; OPT-NEXT: [[TMP30:%.*]] = insertelement <11 x double> [[TMP29]], double [[TMP19]], i64 8 -; OPT-NEXT: [[TMP31:%.*]] = insertelement <11 x double> [[TMP30]], double [[TMP20]], i64 9 -; OPT-NEXT: [[TMP32:%.*]] = insertelement <11 x double> [[TMP31]], double [[TMP21]], i64 10 -; OPT-NEXT: store <11 x double> [[TMP32]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ 
0.000000e+00, [[ENTRY:%.*]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE16]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE18]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE20]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <11 x double> poison, double [[TMP0]], i64 0 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE2]], double [[TMP3]], i64 3 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE3]], double [[TMP4]], i64 4 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE4]], double [[TMP5]], i64 5 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE5]], double [[TMP6]], i64 6 +; OPT-NEXT: 
[[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE6]], double [[TMP7]], i64 7 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE8:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE7]], double [[TMP8]], i64 8 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE9:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE8]], double [[TMP9]], i64 9 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE10:%.*]] = insertelement <11 x double> [[LARGEPHI_INSERTSLICE9]], double [[TMP10]], i64 10 +; OPT-NEXT: store <11 x double> [[LARGEPHI_INSERTSLICE10]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v11f64_cst( @@ -293,72 +293,72 @@ ; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] ; OPT: then: ; OPT-NEXT: [[X:%.*]] = insertelement <15 x i64> [[IN:%.*]], i64 42, i32 3 -; OPT-NEXT: [[TMP0:%.*]] = extractelement <15 x i64> [[X]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = extractelement <15 x i64> [[X]], i64 1 -; OPT-NEXT: [[TMP2:%.*]] = extractelement <15 x i64> [[X]], i64 2 -; OPT-NEXT: [[TMP3:%.*]] = extractelement <15 x i64> [[X]], i64 3 -; OPT-NEXT: [[TMP4:%.*]] = extractelement <15 x i64> [[X]], i64 4 -; OPT-NEXT: [[TMP5:%.*]] = extractelement <15 x i64> [[X]], i64 5 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <15 x i64> [[X]], i64 6 -; OPT-NEXT: [[TMP7:%.*]] = extractelement <15 x i64> [[X]], i64 7 -; OPT-NEXT: [[TMP8:%.*]] = extractelement <15 x i64> [[X]], i64 8 -; OPT-NEXT: [[TMP9:%.*]] = extractelement <15 x i64> [[X]], i64 9 -; OPT-NEXT: [[TMP10:%.*]] = extractelement <15 x i64> [[X]], i64 10 -; OPT-NEXT: [[TMP11:%.*]] = extractelement <15 x i64> [[X]], i64 11 -; OPT-NEXT: [[TMP12:%.*]] = extractelement <15 x i64> [[X]], i64 12 -; OPT-NEXT: [[TMP13:%.*]] = extractelement <15 x i64> [[X]], i64 13 -; OPT-NEXT: [[TMP14:%.*]] = extractelement <15 x i64> [[X]], i64 14 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <15 x i64> [[X]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <15 x i64> [[X]], i64 1 +; 
OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <15 x i64> [[X]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <15 x i64> [[X]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <15 x i64> [[X]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <15 x i64> [[X]], i64 5 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <15 x i64> [[X]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE14:%.*]] = extractelement <15 x i64> [[X]], i64 7 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE16:%.*]] = extractelement <15 x i64> [[X]], i64 8 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE18:%.*]] = extractelement <15 x i64> [[X]], i64 9 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE20:%.*]] = extractelement <15 x i64> [[X]], i64 10 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE22:%.*]] = extractelement <15 x i64> [[X]], i64 11 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE24:%.*]] = extractelement <15 x i64> [[X]], i64 12 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE26:%.*]] = extractelement <15 x i64> [[X]], i64 13 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE28:%.*]] = extractelement <15 x i64> [[X]], i64 14 ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <15 x i64> [[IN]], i64 64, i32 6 -; OPT-NEXT: [[TMP15:%.*]] = extractelement <15 x i64> [[Y]], i64 0 -; OPT-NEXT: [[TMP16:%.*]] = extractelement <15 x i64> [[Y]], i64 1 -; OPT-NEXT: [[TMP17:%.*]] = extractelement <15 x i64> [[Y]], i64 2 -; OPT-NEXT: [[TMP18:%.*]] = extractelement <15 x i64> [[Y]], i64 3 -; OPT-NEXT: [[TMP19:%.*]] = extractelement <15 x i64> [[Y]], i64 4 -; OPT-NEXT: [[TMP20:%.*]] = extractelement <15 x i64> [[Y]], i64 5 -; OPT-NEXT: [[TMP21:%.*]] = extractelement <15 x i64> [[Y]], i64 6 -; OPT-NEXT: [[TMP22:%.*]] = extractelement <15 x i64> [[Y]], i64 7 -; OPT-NEXT: [[TMP23:%.*]] = extractelement <15 x i64> [[Y]], i64 8 -; OPT-NEXT: [[TMP24:%.*]] = extractelement <15 x i64> [[Y]], i64 9 -; OPT-NEXT: [[TMP25:%.*]] = extractelement <15 x i64> [[Y]], i64 10 -; OPT-NEXT: 
[[TMP26:%.*]] = extractelement <15 x i64> [[Y]], i64 11 -; OPT-NEXT: [[TMP27:%.*]] = extractelement <15 x i64> [[Y]], i64 12 -; OPT-NEXT: [[TMP28:%.*]] = extractelement <15 x i64> [[Y]], i64 13 -; OPT-NEXT: [[TMP29:%.*]] = extractelement <15 x i64> [[Y]], i64 14 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <15 x i64> [[Y]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <15 x i64> [[Y]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <15 x i64> [[Y]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <15 x i64> [[Y]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <15 x i64> [[Y]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <15 x i64> [[Y]], i64 5 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <15 x i64> [[Y]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <15 x i64> [[Y]], i64 7 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE17:%.*]] = extractelement <15 x i64> [[Y]], i64 8 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE19:%.*]] = extractelement <15 x i64> [[Y]], i64 9 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE21:%.*]] = extractelement <15 x i64> [[Y]], i64 10 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = extractelement <15 x i64> [[Y]], i64 11 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE25:%.*]] = extractelement <15 x i64> [[Y]], i64 12 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE27:%.*]] = extractelement <15 x i64> [[Y]], i64 13 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE29:%.*]] = extractelement <15 x i64> [[Y]], i64 14 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP30:%.*]] = phi i64 [ [[TMP0]], [[THEN]] ], [ [[TMP15]], [[ELSE]] ] -; OPT-NEXT: [[TMP31:%.*]] = phi i64 [ [[TMP1]], [[THEN]] ], [ [[TMP16]], [[ELSE]] ] -; OPT-NEXT: [[TMP32:%.*]] = phi i64 [ [[TMP2]], [[THEN]] ], [ [[TMP17]], [[ELSE]] ] -; OPT-NEXT: [[TMP33:%.*]] = phi i64 [ [[TMP3]], [[THEN]] ], [ [[TMP18]], [[ELSE]] ] -; OPT-NEXT: [[TMP34:%.*]] = phi i64 [ [[TMP4]], [[THEN]] 
], [ [[TMP19]], [[ELSE]] ] -; OPT-NEXT: [[TMP35:%.*]] = phi i64 [ [[TMP5]], [[THEN]] ], [ [[TMP20]], [[ELSE]] ] -; OPT-NEXT: [[TMP36:%.*]] = phi i64 [ [[TMP6]], [[THEN]] ], [ [[TMP21]], [[ELSE]] ] -; OPT-NEXT: [[TMP37:%.*]] = phi i64 [ [[TMP7]], [[THEN]] ], [ [[TMP22]], [[ELSE]] ] -; OPT-NEXT: [[TMP38:%.*]] = phi i64 [ [[TMP8]], [[THEN]] ], [ [[TMP23]], [[ELSE]] ] -; OPT-NEXT: [[TMP39:%.*]] = phi i64 [ [[TMP9]], [[THEN]] ], [ [[TMP24]], [[ELSE]] ] -; OPT-NEXT: [[TMP40:%.*]] = phi i64 [ [[TMP10]], [[THEN]] ], [ [[TMP25]], [[ELSE]] ] -; OPT-NEXT: [[TMP41:%.*]] = phi i64 [ [[TMP11]], [[THEN]] ], [ [[TMP26]], [[ELSE]] ] -; OPT-NEXT: [[TMP42:%.*]] = phi i64 [ [[TMP12]], [[THEN]] ], [ [[TMP27]], [[ELSE]] ] -; OPT-NEXT: [[TMP43:%.*]] = phi i64 [ [[TMP13]], [[THEN]] ], [ [[TMP28]], [[ELSE]] ] -; OPT-NEXT: [[TMP44:%.*]] = phi i64 [ [[TMP14]], [[THEN]] ], [ [[TMP29]], [[ELSE]] ] -; OPT-NEXT: [[TMP45:%.*]] = insertelement <15 x i64> poison, i64 [[TMP30]], i64 0 -; OPT-NEXT: [[TMP46:%.*]] = insertelement <15 x i64> [[TMP45]], i64 [[TMP31]], i64 1 -; OPT-NEXT: [[TMP47:%.*]] = insertelement <15 x i64> [[TMP46]], i64 [[TMP32]], i64 2 -; OPT-NEXT: [[TMP48:%.*]] = insertelement <15 x i64> [[TMP47]], i64 [[TMP33]], i64 3 -; OPT-NEXT: [[TMP49:%.*]] = insertelement <15 x i64> [[TMP48]], i64 [[TMP34]], i64 4 -; OPT-NEXT: [[TMP50:%.*]] = insertelement <15 x i64> [[TMP49]], i64 [[TMP35]], i64 5 -; OPT-NEXT: [[TMP51:%.*]] = insertelement <15 x i64> [[TMP50]], i64 [[TMP36]], i64 6 -; OPT-NEXT: [[TMP52:%.*]] = insertelement <15 x i64> [[TMP51]], i64 [[TMP37]], i64 7 -; OPT-NEXT: [[TMP53:%.*]] = insertelement <15 x i64> [[TMP52]], i64 [[TMP38]], i64 8 -; OPT-NEXT: [[TMP54:%.*]] = insertelement <15 x i64> [[TMP53]], i64 [[TMP39]], i64 9 -; OPT-NEXT: [[TMP55:%.*]] = insertelement <15 x i64> [[TMP54]], i64 [[TMP40]], i64 10 -; OPT-NEXT: [[TMP56:%.*]] = insertelement <15 x i64> [[TMP55]], i64 [[TMP41]], i64 11 -; OPT-NEXT: [[TMP57:%.*]] = insertelement <15 x i64> [[TMP56]], i64 [[TMP42]], i64 12 
-; OPT-NEXT: [[TMP58:%.*]] = insertelement <15 x i64> [[TMP57]], i64 [[TMP43]], i64 13 -; OPT-NEXT: [[TMP59:%.*]] = insertelement <15 x i64> [[TMP58]], i64 [[TMP44]], i64 14 -; OPT-NEXT: store <15 x i64> [[TMP59]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ] +; OPT-NEXT: [[TMP6:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ] +; OPT-NEXT: [[TMP7:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ] +; OPT-NEXT: [[TMP8:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE16]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE17]], [[ELSE]] ] +; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE18]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE19]], [[ELSE]] ] +; OPT-NEXT: [[TMP10:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE20]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE21]], [[ELSE]] ] +; OPT-NEXT: [[TMP11:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE23]], [[ELSE]] ] +; OPT-NEXT: [[TMP12:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE24]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE25]], [[ELSE]] ] +; OPT-NEXT: [[TMP13:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE26]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE27]], [[ELSE]] ] +; OPT-NEXT: [[TMP14:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE28]], 
[[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE29]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <15 x i64> poison, i64 [[TMP0]], i64 0 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE0]], i64 [[TMP1]], i64 1 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE1]], i64 [[TMP2]], i64 2 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE2]], i64 [[TMP3]], i64 3 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE3]], i64 [[TMP4]], i64 4 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE4]], i64 [[TMP5]], i64 5 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE5]], i64 [[TMP6]], i64 6 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE6]], i64 [[TMP7]], i64 7 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE8:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE7]], i64 [[TMP8]], i64 8 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE9:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE8]], i64 [[TMP9]], i64 9 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE10:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE9]], i64 [[TMP10]], i64 10 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE11:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE10]], i64 [[TMP11]], i64 11 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE12:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE11]], i64 [[TMP12]], i64 12 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE13:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE12]], i64 [[TMP13]], i64 13 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE14:%.*]] = insertelement <15 x i64> [[LARGEPHI_INSERTSLICE13]], i64 [[TMP14]], i64 14 +; OPT-NEXT: store <15 x i64> [[LARGEPHI_INSERTSLICE14]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v15i64( @@ -395,68 +395,68 @@ ; OPT-NEXT: br i1 [[COND:%.*]], 
label [[THEN:%.*]], label [[ELSE:%.*]] ; OPT: then: ; OPT-NEXT: [[X:%.*]] = insertelement <27 x i16> [[IN:%.*]], i16 42, i32 3 -; OPT-NEXT: [[TMP0:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP1:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP2:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP3:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP4:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP5:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP6:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP7:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP8:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP9:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP10:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP11:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP12:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP13:%.*]] = extractelement <27 x i16> [[X]], i64 26 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> 
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE14:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE16:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE18:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE20:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE22:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE24:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE26:%.*]] = extractelement <27 x i16> [[X]], i64 26 ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <27 x i16> [[IN]], i16 64, i32 6 -; OPT-NEXT: [[TMP14:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP15:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP16:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP17:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP18:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP19:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP20:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP21:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP22:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP23:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP24:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP25:%.*]] = shufflevector 
<27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP26:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP27:%.*]] = extractelement <27 x i16> [[Y]], i64 26 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE11:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE13:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE15:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE17:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE19:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE21:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE25:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE27:%.*]] = extractelement <27 x i16> [[Y]], i64 26 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP28:%.*]] = phi <2 x i16> [ [[TMP0]], [[THEN]] ], [ [[TMP14]], [[ELSE]] ] -; OPT-NEXT: [[TMP29:%.*]] = phi <2 x i16> [ [[TMP1]], [[THEN]] ], [ [[TMP15]], [[ELSE]] ] -; OPT-NEXT: [[TMP30:%.*]] = phi <2 x i16> [ [[TMP2]], [[THEN]] ], [ 
[[TMP16]], [[ELSE]] ] -; OPT-NEXT: [[TMP31:%.*]] = phi <2 x i16> [ [[TMP3]], [[THEN]] ], [ [[TMP17]], [[ELSE]] ] -; OPT-NEXT: [[TMP32:%.*]] = phi <2 x i16> [ [[TMP4]], [[THEN]] ], [ [[TMP18]], [[ELSE]] ] -; OPT-NEXT: [[TMP33:%.*]] = phi <2 x i16> [ [[TMP5]], [[THEN]] ], [ [[TMP19]], [[ELSE]] ] -; OPT-NEXT: [[TMP34:%.*]] = phi <2 x i16> [ [[TMP6]], [[THEN]] ], [ [[TMP20]], [[ELSE]] ] -; OPT-NEXT: [[TMP35:%.*]] = phi <2 x i16> [ [[TMP7]], [[THEN]] ], [ [[TMP21]], [[ELSE]] ] -; OPT-NEXT: [[TMP36:%.*]] = phi <2 x i16> [ [[TMP8]], [[THEN]] ], [ [[TMP22]], [[ELSE]] ] -; OPT-NEXT: [[TMP37:%.*]] = phi <2 x i16> [ [[TMP9]], [[THEN]] ], [ [[TMP23]], [[ELSE]] ] -; OPT-NEXT: [[TMP38:%.*]] = phi <2 x i16> [ [[TMP10]], [[THEN]] ], [ [[TMP24]], [[ELSE]] ] -; OPT-NEXT: [[TMP39:%.*]] = phi <2 x i16> [ [[TMP11]], [[THEN]] ], [ [[TMP25]], [[ELSE]] ] -; OPT-NEXT: [[TMP40:%.*]] = phi <2 x i16> [ [[TMP12]], [[THEN]] ], [ [[TMP26]], [[ELSE]] ] -; OPT-NEXT: [[TMP41:%.*]] = phi i16 [ [[TMP13]], [[THEN]] ], [ [[TMP27]], [[ELSE]] ] -; OPT-NEXT: [[TMP42:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> poison, <2 x i16> [[TMP28]], i64 0) -; OPT-NEXT: [[TMP43:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP42]], <2 x i16> [[TMP29]], i64 2) -; OPT-NEXT: [[TMP44:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP43]], <2 x i16> [[TMP30]], i64 4) -; OPT-NEXT: [[TMP45:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP44]], <2 x i16> [[TMP31]], i64 6) -; OPT-NEXT: [[TMP46:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP45]], <2 x i16> [[TMP32]], i64 8) -; OPT-NEXT: [[TMP47:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP46]], <2 x i16> [[TMP33]], i64 10) -; OPT-NEXT: [[TMP48:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP47]], <2 x i16> [[TMP34]], i64 12) -; OPT-NEXT: [[TMP49:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x 
i16> [[TMP48]], <2 x i16> [[TMP35]], i64 14) -; OPT-NEXT: [[TMP50:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP49]], <2 x i16> [[TMP36]], i64 16) -; OPT-NEXT: [[TMP51:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP50]], <2 x i16> [[TMP37]], i64 18) -; OPT-NEXT: [[TMP52:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP51]], <2 x i16> [[TMP38]], i64 20) -; OPT-NEXT: [[TMP53:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP52]], <2 x i16> [[TMP39]], i64 22) -; OPT-NEXT: [[TMP54:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP53]], <2 x i16> [[TMP40]], i64 24) -; OPT-NEXT: [[TMP55:%.*]] = insertelement <27 x i16> [[TMP54]], i16 [[TMP41]], i64 26 -; OPT-NEXT: store <27 x i16> [[TMP55]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ] +; OPT-NEXT: [[TMP6:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ] +; OPT-NEXT: [[TMP7:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ] +; OPT-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE16]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE17]], [[ELSE]] ] +; OPT-NEXT: 
[[TMP9:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE18]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE19]], [[ELSE]] ] +; OPT-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE20]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE21]], [[ELSE]] ] +; OPT-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE23]], [[ELSE]] ] +; OPT-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE24]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE25]], [[ELSE]] ] +; OPT-NEXT: [[TMP13:%.*]] = phi i16 [ [[LARGEPHI_EXTRACTSLICE26]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE27]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> poison, <2 x i16> [[TMP0]], i64 0) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[LARGEPHI_INSERTSLICE0]], <2 x i16> [[TMP1]], i64 2) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[LARGEPHI_INSERTSLICE1]], <2 x i16> [[TMP2]], i64 4) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[LARGEPHI_INSERTSLICE2]], <2 x i16> [[TMP3]], i64 6) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[LARGEPHI_INSERTSLICE3]], <2 x i16> [[TMP4]], i64 8) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE5:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[LARGEPHI_INSERTSLICE4]], <2 x i16> [[TMP5]], i64 10) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE6:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[LARGEPHI_INSERTSLICE5]], <2 x i16> [[TMP6]], i64 12) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE7:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[LARGEPHI_INSERTSLICE6]], <2 x i16> [[TMP7]], i64 14) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE8:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> 
[[LARGEPHI_INSERTSLICE7]], <2 x i16> [[TMP8]], i64 16) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE9:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[LARGEPHI_INSERTSLICE8]], <2 x i16> [[TMP9]], i64 18) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE10:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[LARGEPHI_INSERTSLICE9]], <2 x i16> [[TMP10]], i64 20) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE11:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[LARGEPHI_INSERTSLICE10]], <2 x i16> [[TMP11]], i64 22) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE12:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[LARGEPHI_INSERTSLICE11]], <2 x i16> [[TMP12]], i64 24) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE13:%.*]] = insertelement <27 x i16> [[LARGEPHI_INSERTSLICE12]], i16 [[TMP13]], i64 26 +; OPT-NEXT: store <27 x i16> [[LARGEPHI_INSERTSLICE13]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v27i16( @@ -494,44 +494,44 @@ ; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] ; OPT: then: ; OPT-NEXT: [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3 -; OPT-NEXT: [[TMP0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP1:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP2:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP3:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP4:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP5:%.*]] = extractelement <23 x i8> [[X]], i64 20 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <23 x i8> [[X]], i64 21 -; OPT-NEXT: [[TMP7:%.*]] = extractelement <23 x i8> [[X]], i64 22 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: 
[[LARGEPHI_EXTRACTSLICE4:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <23 x i8> [[X]], i64 20 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <23 x i8> [[X]], i64 21 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE14:%.*]] = extractelement <23 x i8> [[X]], i64 22 ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6 -; OPT-NEXT: [[TMP8:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP10:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP11:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP12:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP13:%.*]] = extractelement <23 x i8> [[Y]], i64 20 -; OPT-NEXT: [[TMP14:%.*]] = extractelement <23 x i8> [[Y]], i64 21 -; OPT-NEXT: [[TMP15:%.*]] = extractelement <23 x i8> [[Y]], i64 22 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x 
i8> [[Y]], i64 21 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP16:%.*]] = phi <4 x i8> [ [[TMP0]], [[THEN]] ], [ [[TMP8]], [[ELSE]] ] -; OPT-NEXT: [[TMP17:%.*]] = phi <4 x i8> [ [[TMP1]], [[THEN]] ], [ [[TMP9]], [[ELSE]] ] -; OPT-NEXT: [[TMP18:%.*]] = phi <4 x i8> [ [[TMP2]], [[THEN]] ], [ [[TMP10]], [[ELSE]] ] -; OPT-NEXT: [[TMP19:%.*]] = phi <4 x i8> [ [[TMP3]], [[THEN]] ], [ [[TMP11]], [[ELSE]] ] -; OPT-NEXT: [[TMP20:%.*]] = phi <4 x i8> [ [[TMP4]], [[THEN]] ], [ [[TMP12]], [[ELSE]] ] -; OPT-NEXT: [[TMP21:%.*]] = phi i8 [ [[TMP5]], [[THEN]] ], [ [[TMP13]], [[ELSE]] ] -; OPT-NEXT: [[TMP22:%.*]] = phi i8 [ [[TMP6]], [[THEN]] ], [ [[TMP14]], [[ELSE]] ] -; OPT-NEXT: [[TMP23:%.*]] = phi i8 [ [[TMP7]], [[THEN]] ], [ [[TMP15]], [[ELSE]] ] -; OPT-NEXT: [[TMP24:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP16]], i64 0) -; OPT-NEXT: [[TMP25:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP24]], <4 x i8> [[TMP17]], i64 4) -; OPT-NEXT: [[TMP26:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP25]], <4 x i8> [[TMP18]], i64 8) -; OPT-NEXT: [[TMP27:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP26]], <4 x i8> [[TMP19]], i64 12) -; OPT-NEXT: [[TMP28:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP27]], <4 x i8> [[TMP20]], i64 16) -; OPT-NEXT: [[TMP29:%.*]] = insertelement <23 x i8> [[TMP28]], i8 [[TMP21]], i64 20 -; OPT-NEXT: [[TMP30:%.*]] = insertelement <23 x i8> [[TMP29]], i8 [[TMP22]], i64 21 -; OPT-NEXT: [[TMP31:%.*]] = insertelement <23 x i8> [[TMP30]], i8 [[TMP23]], i64 22 -; OPT-NEXT: store <23 x i8> [[TMP31]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ 
[[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ] +; OPT-NEXT: [[TMP6:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ] +; OPT-NEXT: [[TMP7:%.*]] = phi i8 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22 +; OPT-NEXT: store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: 
@phi_v23i8( @@ -571,33 +571,33 @@ ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6 -; OPT-NEXT: [[TMP0:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP2:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP4:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP5:%.*]] = extractelement <23 x i8> [[Y]], i64 20 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <23 x i8> [[Y]], i64 21 -; OPT-NEXT: [[TMP7:%.*]] = extractelement <23 x i8> [[Y]], i64 22 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i8> [[Y]], i64 20 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i8> [[Y]], i64 21 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i8> [[Y]], i64 22 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP8:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[TMP0]], [[ELSE]] ] -; OPT-NEXT: [[TMP9:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[TMP1]], [[ELSE]] ] -; OPT-NEXT: [[TMP10:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[TMP2]], [[ELSE]] ] -; OPT-NEXT: [[TMP11:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[TMP3]], 
[[ELSE]] ] -; OPT-NEXT: [[TMP12:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[TMP4]], [[ELSE]] ] -; OPT-NEXT: [[TMP13:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[TMP5]], [[ELSE]] ] -; OPT-NEXT: [[TMP14:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[TMP6]], [[ELSE]] ] -; OPT-NEXT: [[TMP15:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[TMP7]], [[ELSE]] ] -; OPT-NEXT: [[TMP16:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP8]], i64 0) -; OPT-NEXT: [[TMP17:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP16]], <4 x i8> [[TMP9]], i64 4) -; OPT-NEXT: [[TMP18:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP17]], <4 x i8> [[TMP10]], i64 8) -; OPT-NEXT: [[TMP19:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP18]], <4 x i8> [[TMP11]], i64 12) -; OPT-NEXT: [[TMP20:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP19]], <4 x i8> [[TMP12]], i64 16) -; OPT-NEXT: [[TMP21:%.*]] = insertelement <23 x i8> [[TMP20]], i8 [[TMP13]], i64 20 -; OPT-NEXT: [[TMP22:%.*]] = insertelement <23 x i8> [[TMP21]], i8 [[TMP14]], i64 21 -; OPT-NEXT: [[TMP23:%.*]] = insertelement <23 x i8> [[TMP22]], i8 [[TMP15]], i64 22 -; OPT-NEXT: store <23 x i8> [[TMP23]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ] +; OPT-NEXT: [[TMP6:%.*]] = phi i8 [ 0, [[THEN]] ], [ 
[[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ] +; OPT-NEXT: [[TMP7:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP0]], i64 0) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE2]], <4 x i8> [[TMP3]], i64 12) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[LARGEPHI_INSERTSLICE3]], <4 x i8> [[TMP4]], i64 16) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 20 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE5]], i8 [[TMP6]], i64 21 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i8> [[LARGEPHI_INSERTSLICE6]], i8 [[TMP7]], i64 22 +; OPT-NEXT: store <23 x i8> [[LARGEPHI_INSERTSLICE7]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v23i8_zeroinit( @@ -634,27 +634,27 @@ ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6 -; OPT-NEXT: [[TMP0:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP1:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP2:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> -; OPT-NEXT: [[TMP3:%.*]] = extractelement <15 x i8> [[Y]], i64 12 -; OPT-NEXT: [[TMP4:%.*]] = extractelement <15 x i8> [[Y]], i64 13 -; OPT-NEXT: [[TMP5:%.*]] = extractelement <15 x i8> [[Y]], i64 14 +; OPT-NEXT: 
[[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <15 x i8> [[Y]], i64 12 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <15 x i8> [[Y]], i64 13 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <15 x i8> [[Y]], i64 14 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP6:%.*]] = phi <4 x i8> [ , [[THEN]] ], [ [[TMP0]], [[ELSE]] ] -; OPT-NEXT: [[TMP7:%.*]] = phi <4 x i8> [ , [[THEN]] ], [ [[TMP1]], [[ELSE]] ] -; OPT-NEXT: [[TMP8:%.*]] = phi <4 x i8> [ , [[THEN]] ], [ [[TMP2]], [[ELSE]] ] -; OPT-NEXT: [[TMP9:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[TMP3]], [[ELSE]] ] -; OPT-NEXT: [[TMP10:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[TMP4]], [[ELSE]] ] -; OPT-NEXT: [[TMP11:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[TMP5]], [[ELSE]] ] -; OPT-NEXT: [[TMP12:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP6]], i64 0) -; OPT-NEXT: [[TMP13:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[TMP12]], <4 x i8> [[TMP7]], i64 4) -; OPT-NEXT: [[TMP14:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[TMP13]], <4 x i8> [[TMP8]], i64 8) -; OPT-NEXT: [[TMP15:%.*]] = insertelement <15 x i8> [[TMP14]], i8 [[TMP9]], i64 12 -; OPT-NEXT: [[TMP16:%.*]] = insertelement <15 x i8> [[TMP15]], i8 [[TMP10]], i64 13 -; OPT-NEXT: [[TMP17:%.*]] = insertelement <15 x i8> [[TMP16]], i8 [[TMP11]], i64 14 -; OPT-NEXT: store <15 x i8> [[TMP17]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ , [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ , [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi <4 x i8> 
[ , [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP0]], i64 0) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE0]], <4 x i8> [[TMP1]], i64 4) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[LARGEPHI_INSERTSLICE1]], <4 x i8> [[TMP2]], i64 8) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE2]], i8 [[TMP3]], i64 12 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE3]], i8 [[TMP4]], i64 13 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <15 x i8> [[LARGEPHI_INSERTSLICE4]], i8 [[TMP5]], i64 14 +; OPT-NEXT: store <15 x i8> [[LARGEPHI_INSERTSLICE5]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v15i8_random_constant_init( @@ -689,104 +689,104 @@ ; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] ; OPT: then: ; OPT-NEXT: [[X:%.*]] = insertelement <23 x i32> [[IN:%.*]], i32 42, i32 3 -; OPT-NEXT: [[TMP0:%.*]] = extractelement <23 x i32> [[X]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = extractelement <23 x i32> [[X]], i64 1 -; OPT-NEXT: [[TMP2:%.*]] = extractelement <23 x i32> [[X]], i64 2 -; OPT-NEXT: [[TMP3:%.*]] = extractelement <23 x i32> [[X]], i64 3 -; OPT-NEXT: [[TMP4:%.*]] = extractelement <23 x i32> [[X]], i64 4 -; OPT-NEXT: [[TMP5:%.*]] = extractelement <23 x i32> [[X]], i64 5 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <23 x i32> [[X]], i64 6 -; OPT-NEXT: [[TMP7:%.*]] = extractelement <23 x i32> [[X]], 
i64 7 -; OPT-NEXT: [[TMP8:%.*]] = extractelement <23 x i32> [[X]], i64 8 -; OPT-NEXT: [[TMP9:%.*]] = extractelement <23 x i32> [[X]], i64 9 -; OPT-NEXT: [[TMP10:%.*]] = extractelement <23 x i32> [[X]], i64 10 -; OPT-NEXT: [[TMP11:%.*]] = extractelement <23 x i32> [[X]], i64 11 -; OPT-NEXT: [[TMP12:%.*]] = extractelement <23 x i32> [[X]], i64 12 -; OPT-NEXT: [[TMP13:%.*]] = extractelement <23 x i32> [[X]], i64 13 -; OPT-NEXT: [[TMP14:%.*]] = extractelement <23 x i32> [[X]], i64 14 -; OPT-NEXT: [[TMP15:%.*]] = extractelement <23 x i32> [[X]], i64 15 -; OPT-NEXT: [[TMP16:%.*]] = extractelement <23 x i32> [[X]], i64 16 -; OPT-NEXT: [[TMP17:%.*]] = extractelement <23 x i32> [[X]], i64 17 -; OPT-NEXT: [[TMP18:%.*]] = extractelement <23 x i32> [[X]], i64 18 -; OPT-NEXT: [[TMP19:%.*]] = extractelement <23 x i32> [[X]], i64 19 -; OPT-NEXT: [[TMP20:%.*]] = extractelement <23 x i32> [[X]], i64 20 -; OPT-NEXT: [[TMP21:%.*]] = extractelement <23 x i32> [[X]], i64 21 -; OPT-NEXT: [[TMP22:%.*]] = extractelement <23 x i32> [[X]], i64 22 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <23 x i32> [[X]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <23 x i32> [[X]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <23 x i32> [[X]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <23 x i32> [[X]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <23 x i32> [[X]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <23 x i32> [[X]], i64 5 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <23 x i32> [[X]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE14:%.*]] = extractelement <23 x i32> [[X]], i64 7 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE16:%.*]] = extractelement <23 x i32> [[X]], i64 8 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE18:%.*]] = extractelement <23 x i32> [[X]], i64 9 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE20:%.*]] = extractelement <23 x i32> [[X]], i64 10 +; OPT-NEXT: 
[[LARGEPHI_EXTRACTSLICE22:%.*]] = extractelement <23 x i32> [[X]], i64 11 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE24:%.*]] = extractelement <23 x i32> [[X]], i64 12 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE26:%.*]] = extractelement <23 x i32> [[X]], i64 13 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE28:%.*]] = extractelement <23 x i32> [[X]], i64 14 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE30:%.*]] = extractelement <23 x i32> [[X]], i64 15 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE32:%.*]] = extractelement <23 x i32> [[X]], i64 16 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE34:%.*]] = extractelement <23 x i32> [[X]], i64 17 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE36:%.*]] = extractelement <23 x i32> [[X]], i64 18 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE38:%.*]] = extractelement <23 x i32> [[X]], i64 19 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE40:%.*]] = extractelement <23 x i32> [[X]], i64 20 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE42:%.*]] = extractelement <23 x i32> [[X]], i64 21 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE44:%.*]] = extractelement <23 x i32> [[X]], i64 22 ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <23 x i32> [[IN]], i32 64, i32 6 -; OPT-NEXT: [[TMP23:%.*]] = extractelement <23 x i32> [[Y]], i64 0 -; OPT-NEXT: [[TMP24:%.*]] = extractelement <23 x i32> [[Y]], i64 1 -; OPT-NEXT: [[TMP25:%.*]] = extractelement <23 x i32> [[Y]], i64 2 -; OPT-NEXT: [[TMP26:%.*]] = extractelement <23 x i32> [[Y]], i64 3 -; OPT-NEXT: [[TMP27:%.*]] = extractelement <23 x i32> [[Y]], i64 4 -; OPT-NEXT: [[TMP28:%.*]] = extractelement <23 x i32> [[Y]], i64 5 -; OPT-NEXT: [[TMP29:%.*]] = extractelement <23 x i32> [[Y]], i64 6 -; OPT-NEXT: [[TMP30:%.*]] = extractelement <23 x i32> [[Y]], i64 7 -; OPT-NEXT: [[TMP31:%.*]] = extractelement <23 x i32> [[Y]], i64 8 -; OPT-NEXT: [[TMP32:%.*]] = extractelement <23 x i32> [[Y]], i64 9 -; OPT-NEXT: [[TMP33:%.*]] = extractelement <23 x i32> [[Y]], i64 10 -; OPT-NEXT: [[TMP34:%.*]] = extractelement <23 x i32> [[Y]], i64 11 -; OPT-NEXT: [[TMP35:%.*]] = 
extractelement <23 x i32> [[Y]], i64 12 -; OPT-NEXT: [[TMP36:%.*]] = extractelement <23 x i32> [[Y]], i64 13 -; OPT-NEXT: [[TMP37:%.*]] = extractelement <23 x i32> [[Y]], i64 14 -; OPT-NEXT: [[TMP38:%.*]] = extractelement <23 x i32> [[Y]], i64 15 -; OPT-NEXT: [[TMP39:%.*]] = extractelement <23 x i32> [[Y]], i64 16 -; OPT-NEXT: [[TMP40:%.*]] = extractelement <23 x i32> [[Y]], i64 17 -; OPT-NEXT: [[TMP41:%.*]] = extractelement <23 x i32> [[Y]], i64 18 -; OPT-NEXT: [[TMP42:%.*]] = extractelement <23 x i32> [[Y]], i64 19 -; OPT-NEXT: [[TMP43:%.*]] = extractelement <23 x i32> [[Y]], i64 20 -; OPT-NEXT: [[TMP44:%.*]] = extractelement <23 x i32> [[Y]], i64 21 -; OPT-NEXT: [[TMP45:%.*]] = extractelement <23 x i32> [[Y]], i64 22 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <23 x i32> [[Y]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <23 x i32> [[Y]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <23 x i32> [[Y]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <23 x i32> [[Y]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <23 x i32> [[Y]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <23 x i32> [[Y]], i64 5 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <23 x i32> [[Y]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <23 x i32> [[Y]], i64 7 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE17:%.*]] = extractelement <23 x i32> [[Y]], i64 8 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE19:%.*]] = extractelement <23 x i32> [[Y]], i64 9 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE21:%.*]] = extractelement <23 x i32> [[Y]], i64 10 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = extractelement <23 x i32> [[Y]], i64 11 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE25:%.*]] = extractelement <23 x i32> [[Y]], i64 12 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE27:%.*]] = extractelement <23 x i32> [[Y]], i64 13 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE29:%.*]] = extractelement <23 
x i32> [[Y]], i64 14 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE31:%.*]] = extractelement <23 x i32> [[Y]], i64 15 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE33:%.*]] = extractelement <23 x i32> [[Y]], i64 16 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE35:%.*]] = extractelement <23 x i32> [[Y]], i64 17 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE37:%.*]] = extractelement <23 x i32> [[Y]], i64 18 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE39:%.*]] = extractelement <23 x i32> [[Y]], i64 19 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE41:%.*]] = extractelement <23 x i32> [[Y]], i64 20 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE43:%.*]] = extractelement <23 x i32> [[Y]], i64 21 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE45:%.*]] = extractelement <23 x i32> [[Y]], i64 22 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP46:%.*]] = phi i32 [ [[TMP0]], [[THEN]] ], [ [[TMP23]], [[ELSE]] ] -; OPT-NEXT: [[TMP47:%.*]] = phi i32 [ [[TMP1]], [[THEN]] ], [ [[TMP24]], [[ELSE]] ] -; OPT-NEXT: [[TMP48:%.*]] = phi i32 [ [[TMP2]], [[THEN]] ], [ [[TMP25]], [[ELSE]] ] -; OPT-NEXT: [[TMP49:%.*]] = phi i32 [ [[TMP3]], [[THEN]] ], [ [[TMP26]], [[ELSE]] ] -; OPT-NEXT: [[TMP50:%.*]] = phi i32 [ [[TMP4]], [[THEN]] ], [ [[TMP27]], [[ELSE]] ] -; OPT-NEXT: [[TMP51:%.*]] = phi i32 [ [[TMP5]], [[THEN]] ], [ [[TMP28]], [[ELSE]] ] -; OPT-NEXT: [[TMP52:%.*]] = phi i32 [ [[TMP6]], [[THEN]] ], [ [[TMP29]], [[ELSE]] ] -; OPT-NEXT: [[TMP53:%.*]] = phi i32 [ [[TMP7]], [[THEN]] ], [ [[TMP30]], [[ELSE]] ] -; OPT-NEXT: [[TMP54:%.*]] = phi i32 [ [[TMP8]], [[THEN]] ], [ [[TMP31]], [[ELSE]] ] -; OPT-NEXT: [[TMP55:%.*]] = phi i32 [ [[TMP9]], [[THEN]] ], [ [[TMP32]], [[ELSE]] ] -; OPT-NEXT: [[TMP56:%.*]] = phi i32 [ [[TMP10]], [[THEN]] ], [ [[TMP33]], [[ELSE]] ] -; OPT-NEXT: [[TMP57:%.*]] = phi i32 [ [[TMP11]], [[THEN]] ], [ [[TMP34]], [[ELSE]] ] -; OPT-NEXT: [[TMP58:%.*]] = phi i32 [ [[TMP12]], [[THEN]] ], [ [[TMP35]], [[ELSE]] ] -; OPT-NEXT: [[TMP59:%.*]] = phi i32 [ [[TMP13]], [[THEN]] ], [ [[TMP36]], [[ELSE]] ] -; OPT-NEXT: [[TMP60:%.*]] = phi i32 [ 
[[TMP14]], [[THEN]] ], [ [[TMP37]], [[ELSE]] ] -; OPT-NEXT: [[TMP61:%.*]] = phi i32 [ [[TMP15]], [[THEN]] ], [ [[TMP38]], [[ELSE]] ] -; OPT-NEXT: [[TMP62:%.*]] = phi i32 [ [[TMP16]], [[THEN]] ], [ [[TMP39]], [[ELSE]] ] -; OPT-NEXT: [[TMP63:%.*]] = phi i32 [ [[TMP17]], [[THEN]] ], [ [[TMP40]], [[ELSE]] ] -; OPT-NEXT: [[TMP64:%.*]] = phi i32 [ [[TMP18]], [[THEN]] ], [ [[TMP41]], [[ELSE]] ] -; OPT-NEXT: [[TMP65:%.*]] = phi i32 [ [[TMP19]], [[THEN]] ], [ [[TMP42]], [[ELSE]] ] -; OPT-NEXT: [[TMP66:%.*]] = phi i32 [ [[TMP20]], [[THEN]] ], [ [[TMP43]], [[ELSE]] ] -; OPT-NEXT: [[TMP67:%.*]] = phi i32 [ [[TMP21]], [[THEN]] ], [ [[TMP44]], [[ELSE]] ] -; OPT-NEXT: [[TMP68:%.*]] = phi i32 [ [[TMP22]], [[THEN]] ], [ [[TMP45]], [[ELSE]] ] -; OPT-NEXT: [[TMP69:%.*]] = insertelement <23 x i32> poison, i32 [[TMP46]], i64 0 -; OPT-NEXT: [[TMP70:%.*]] = insertelement <23 x i32> [[TMP69]], i32 [[TMP47]], i64 1 -; OPT-NEXT: [[TMP71:%.*]] = insertelement <23 x i32> [[TMP70]], i32 [[TMP48]], i64 2 -; OPT-NEXT: [[TMP72:%.*]] = insertelement <23 x i32> [[TMP71]], i32 [[TMP49]], i64 3 -; OPT-NEXT: [[TMP73:%.*]] = insertelement <23 x i32> [[TMP72]], i32 [[TMP50]], i64 4 -; OPT-NEXT: [[TMP74:%.*]] = insertelement <23 x i32> [[TMP73]], i32 [[TMP51]], i64 5 -; OPT-NEXT: [[TMP75:%.*]] = insertelement <23 x i32> [[TMP74]], i32 [[TMP52]], i64 6 -; OPT-NEXT: [[TMP76:%.*]] = insertelement <23 x i32> [[TMP75]], i32 [[TMP53]], i64 7 -; OPT-NEXT: [[TMP77:%.*]] = insertelement <23 x i32> [[TMP76]], i32 [[TMP54]], i64 8 -; OPT-NEXT: [[TMP78:%.*]] = insertelement <23 x i32> [[TMP77]], i32 [[TMP55]], i64 9 -; OPT-NEXT: [[TMP79:%.*]] = insertelement <23 x i32> [[TMP78]], i32 [[TMP56]], i64 10 -; OPT-NEXT: [[TMP80:%.*]] = insertelement <23 x i32> [[TMP79]], i32 [[TMP57]], i64 11 -; OPT-NEXT: [[TMP81:%.*]] = insertelement <23 x i32> [[TMP80]], i32 [[TMP58]], i64 12 -; OPT-NEXT: [[TMP82:%.*]] = insertelement <23 x i32> [[TMP81]], i32 [[TMP59]], i64 13 -; OPT-NEXT: [[TMP83:%.*]] = insertelement <23 x i32> 
[[TMP82]], i32 [[TMP60]], i64 14 -; OPT-NEXT: [[TMP84:%.*]] = insertelement <23 x i32> [[TMP83]], i32 [[TMP61]], i64 15 -; OPT-NEXT: [[TMP85:%.*]] = insertelement <23 x i32> [[TMP84]], i32 [[TMP62]], i64 16 -; OPT-NEXT: [[TMP86:%.*]] = insertelement <23 x i32> [[TMP85]], i32 [[TMP63]], i64 17 -; OPT-NEXT: [[TMP87:%.*]] = insertelement <23 x i32> [[TMP86]], i32 [[TMP64]], i64 18 -; OPT-NEXT: [[TMP88:%.*]] = insertelement <23 x i32> [[TMP87]], i32 [[TMP65]], i64 19 -; OPT-NEXT: [[TMP89:%.*]] = insertelement <23 x i32> [[TMP88]], i32 [[TMP66]], i64 20 -; OPT-NEXT: [[TMP90:%.*]] = insertelement <23 x i32> [[TMP89]], i32 [[TMP67]], i64 21 -; OPT-NEXT: [[TMP91:%.*]] = insertelement <23 x i32> [[TMP90]], i32 [[TMP68]], i64 22 -; OPT-NEXT: store <23 x i32> [[TMP91]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ] +; OPT-NEXT: [[TMP6:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ] +; OPT-NEXT: [[TMP7:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ] +; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE16]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE17]], [[ELSE]] ] +; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE18]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE19]], [[ELSE]] ] +; 
OPT-NEXT: [[TMP10:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE20]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE21]], [[ELSE]] ] +; OPT-NEXT: [[TMP11:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE23]], [[ELSE]] ] +; OPT-NEXT: [[TMP12:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE24]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE25]], [[ELSE]] ] +; OPT-NEXT: [[TMP13:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE26]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE27]], [[ELSE]] ] +; OPT-NEXT: [[TMP14:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE28]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE29]], [[ELSE]] ] +; OPT-NEXT: [[TMP15:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE30]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE31]], [[ELSE]] ] +; OPT-NEXT: [[TMP16:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE32]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE33]], [[ELSE]] ] +; OPT-NEXT: [[TMP17:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE34]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE35]], [[ELSE]] ] +; OPT-NEXT: [[TMP18:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE36]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE37]], [[ELSE]] ] +; OPT-NEXT: [[TMP19:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE38]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE39]], [[ELSE]] ] +; OPT-NEXT: [[TMP20:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE40]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE41]], [[ELSE]] ] +; OPT-NEXT: [[TMP21:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE42]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE43]], [[ELSE]] ] +; OPT-NEXT: [[TMP22:%.*]] = phi i32 [ [[LARGEPHI_EXTRACTSLICE44]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE45]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <23 x i32> poison, i32 [[TMP0]], i64 0 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE0]], i32 [[TMP1]], i64 1 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE1]], i32 [[TMP2]], i64 2 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <23 x i32> 
[[LARGEPHI_INSERTSLICE2]], i32 [[TMP3]], i64 3 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE3]], i32 [[TMP4]], i64 4 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE4]], i32 [[TMP5]], i64 5 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE5]], i32 [[TMP6]], i64 6 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE6]], i32 [[TMP7]], i64 7 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE8:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE7]], i32 [[TMP8]], i64 8 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE9:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE8]], i32 [[TMP9]], i64 9 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE10:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE9]], i32 [[TMP10]], i64 10 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE11:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE10]], i32 [[TMP11]], i64 11 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE12:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE11]], i32 [[TMP12]], i64 12 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE13:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE12]], i32 [[TMP13]], i64 13 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE14:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE13]], i32 [[TMP14]], i64 14 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE15:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE14]], i32 [[TMP15]], i64 15 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE16:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE15]], i32 [[TMP16]], i64 16 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE17:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE16]], i32 [[TMP17]], i64 17 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE18:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE17]], i32 [[TMP18]], i64 18 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE19:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE18]], i32 [[TMP19]], i64 19 +; OPT-NEXT: 
[[LARGEPHI_INSERTSLICE20:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE19]], i32 [[TMP20]], i64 20 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE21:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE20]], i32 [[TMP21]], i64 21 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE22:%.*]] = insertelement <23 x i32> [[LARGEPHI_INSERTSLICE21]], i32 [[TMP22]], i64 22 +; OPT-NEXT: store <23 x i32> [[LARGEPHI_INSERTSLICE22]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v23i32( @@ -823,76 +823,76 @@ ; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] ; OPT: then: ; OPT-NEXT: [[X:%.*]] = insertelement <16 x i64> [[IN:%.*]], i64 42, i32 3 -; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i64> [[X]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i64> [[X]], i64 1 -; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i64> [[X]], i64 2 -; OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i64> [[X]], i64 3 -; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i64> [[X]], i64 4 -; OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i64> [[X]], i64 5 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i64> [[X]], i64 6 -; OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i64> [[X]], i64 7 -; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i64> [[X]], i64 8 -; OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i64> [[X]], i64 9 -; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i64> [[X]], i64 10 -; OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i64> [[X]], i64 11 -; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i64> [[X]], i64 12 -; OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i64> [[X]], i64 13 -; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i64> [[X]], i64 14 -; OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i64> [[X]], i64 15 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <16 x i64> [[X]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <16 x i64> [[X]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <16 x i64> [[X]], 
i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <16 x i64> [[X]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <16 x i64> [[X]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <16 x i64> [[X]], i64 5 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <16 x i64> [[X]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE14:%.*]] = extractelement <16 x i64> [[X]], i64 7 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE16:%.*]] = extractelement <16 x i64> [[X]], i64 8 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE18:%.*]] = extractelement <16 x i64> [[X]], i64 9 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE20:%.*]] = extractelement <16 x i64> [[X]], i64 10 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE22:%.*]] = extractelement <16 x i64> [[X]], i64 11 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE24:%.*]] = extractelement <16 x i64> [[X]], i64 12 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE26:%.*]] = extractelement <16 x i64> [[X]], i64 13 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE28:%.*]] = extractelement <16 x i64> [[X]], i64 14 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE30:%.*]] = extractelement <16 x i64> [[X]], i64 15 ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <16 x i64> [[IN]], i64 64, i32 6 -; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i64> [[Y]], i64 0 -; OPT-NEXT: [[TMP17:%.*]] = extractelement <16 x i64> [[Y]], i64 1 -; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i64> [[Y]], i64 2 -; OPT-NEXT: [[TMP19:%.*]] = extractelement <16 x i64> [[Y]], i64 3 -; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i64> [[Y]], i64 4 -; OPT-NEXT: [[TMP21:%.*]] = extractelement <16 x i64> [[Y]], i64 5 -; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i64> [[Y]], i64 6 -; OPT-NEXT: [[TMP23:%.*]] = extractelement <16 x i64> [[Y]], i64 7 -; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i64> [[Y]], i64 8 -; OPT-NEXT: [[TMP25:%.*]] = extractelement <16 x i64> [[Y]], i64 9 -; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i64> [[Y]], i64 10 
-; OPT-NEXT: [[TMP27:%.*]] = extractelement <16 x i64> [[Y]], i64 11 -; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i64> [[Y]], i64 12 -; OPT-NEXT: [[TMP29:%.*]] = extractelement <16 x i64> [[Y]], i64 13 -; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i64> [[Y]], i64 14 -; OPT-NEXT: [[TMP31:%.*]] = extractelement <16 x i64> [[Y]], i64 15 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <16 x i64> [[Y]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <16 x i64> [[Y]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = extractelement <16 x i64> [[Y]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <16 x i64> [[Y]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <16 x i64> [[Y]], i64 4 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <16 x i64> [[Y]], i64 5 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <16 x i64> [[Y]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE15:%.*]] = extractelement <16 x i64> [[Y]], i64 7 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE17:%.*]] = extractelement <16 x i64> [[Y]], i64 8 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE19:%.*]] = extractelement <16 x i64> [[Y]], i64 9 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE21:%.*]] = extractelement <16 x i64> [[Y]], i64 10 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = extractelement <16 x i64> [[Y]], i64 11 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE25:%.*]] = extractelement <16 x i64> [[Y]], i64 12 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE27:%.*]] = extractelement <16 x i64> [[Y]], i64 13 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE29:%.*]] = extractelement <16 x i64> [[Y]], i64 14 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE31:%.*]] = extractelement <16 x i64> [[Y]], i64 15 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP32:%.*]] = phi i64 [ [[TMP0]], [[THEN]] ], [ [[TMP16]], [[ELSE]] ] -; OPT-NEXT: [[TMP33:%.*]] = phi i64 [ [[TMP1]], [[THEN]] ], [ [[TMP17]], [[ELSE]] ] -; OPT-NEXT: [[TMP34:%.*]] = phi i64 [ [[TMP2]], [[THEN]] 
], [ [[TMP18]], [[ELSE]] ] -; OPT-NEXT: [[TMP35:%.*]] = phi i64 [ [[TMP3]], [[THEN]] ], [ [[TMP19]], [[ELSE]] ] -; OPT-NEXT: [[TMP36:%.*]] = phi i64 [ [[TMP4]], [[THEN]] ], [ [[TMP20]], [[ELSE]] ] -; OPT-NEXT: [[TMP37:%.*]] = phi i64 [ [[TMP5]], [[THEN]] ], [ [[TMP21]], [[ELSE]] ] -; OPT-NEXT: [[TMP38:%.*]] = phi i64 [ [[TMP6]], [[THEN]] ], [ [[TMP22]], [[ELSE]] ] -; OPT-NEXT: [[TMP39:%.*]] = phi i64 [ [[TMP7]], [[THEN]] ], [ [[TMP23]], [[ELSE]] ] -; OPT-NEXT: [[TMP40:%.*]] = phi i64 [ [[TMP8]], [[THEN]] ], [ [[TMP24]], [[ELSE]] ] -; OPT-NEXT: [[TMP41:%.*]] = phi i64 [ [[TMP9]], [[THEN]] ], [ [[TMP25]], [[ELSE]] ] -; OPT-NEXT: [[TMP42:%.*]] = phi i64 [ [[TMP10]], [[THEN]] ], [ [[TMP26]], [[ELSE]] ] -; OPT-NEXT: [[TMP43:%.*]] = phi i64 [ [[TMP11]], [[THEN]] ], [ [[TMP27]], [[ELSE]] ] -; OPT-NEXT: [[TMP44:%.*]] = phi i64 [ [[TMP12]], [[THEN]] ], [ [[TMP28]], [[ELSE]] ] -; OPT-NEXT: [[TMP45:%.*]] = phi i64 [ [[TMP13]], [[THEN]] ], [ [[TMP29]], [[ELSE]] ] -; OPT-NEXT: [[TMP46:%.*]] = phi i64 [ [[TMP14]], [[THEN]] ], [ [[TMP30]], [[ELSE]] ] -; OPT-NEXT: [[TMP47:%.*]] = phi i64 [ [[TMP15]], [[THEN]] ], [ [[TMP31]], [[ELSE]] ] -; OPT-NEXT: [[TMP48:%.*]] = insertelement <16 x i64> poison, i64 [[TMP32]], i64 0 -; OPT-NEXT: [[TMP49:%.*]] = insertelement <16 x i64> [[TMP48]], i64 [[TMP33]], i64 1 -; OPT-NEXT: [[TMP50:%.*]] = insertelement <16 x i64> [[TMP49]], i64 [[TMP34]], i64 2 -; OPT-NEXT: [[TMP51:%.*]] = insertelement <16 x i64> [[TMP50]], i64 [[TMP35]], i64 3 -; OPT-NEXT: [[TMP52:%.*]] = insertelement <16 x i64> [[TMP51]], i64 [[TMP36]], i64 4 -; OPT-NEXT: [[TMP53:%.*]] = insertelement <16 x i64> [[TMP52]], i64 [[TMP37]], i64 5 -; OPT-NEXT: [[TMP54:%.*]] = insertelement <16 x i64> [[TMP53]], i64 [[TMP38]], i64 6 -; OPT-NEXT: [[TMP55:%.*]] = insertelement <16 x i64> [[TMP54]], i64 [[TMP39]], i64 7 -; OPT-NEXT: [[TMP56:%.*]] = insertelement <16 x i64> [[TMP55]], i64 [[TMP40]], i64 8 -; OPT-NEXT: [[TMP57:%.*]] = insertelement <16 x i64> [[TMP56]], i64 [[TMP41]], i64 9 -; 
OPT-NEXT: [[TMP58:%.*]] = insertelement <16 x i64> [[TMP57]], i64 [[TMP42]], i64 10 -; OPT-NEXT: [[TMP59:%.*]] = insertelement <16 x i64> [[TMP58]], i64 [[TMP43]], i64 11 -; OPT-NEXT: [[TMP60:%.*]] = insertelement <16 x i64> [[TMP59]], i64 [[TMP44]], i64 12 -; OPT-NEXT: [[TMP61:%.*]] = insertelement <16 x i64> [[TMP60]], i64 [[TMP45]], i64 13 -; OPT-NEXT: [[TMP62:%.*]] = insertelement <16 x i64> [[TMP61]], i64 [[TMP46]], i64 14 -; OPT-NEXT: [[TMP63:%.*]] = insertelement <16 x i64> [[TMP62]], i64 [[TMP47]], i64 15 -; OPT-NEXT: store <16 x i64> [[TMP63]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[ELSE]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ] +; OPT-NEXT: [[TMP6:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ] +; OPT-NEXT: [[TMP7:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE14]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE15]], [[ELSE]] ] +; OPT-NEXT: [[TMP8:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE16]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE17]], [[ELSE]] ] +; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE18]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE19]], [[ELSE]] ] +; OPT-NEXT: [[TMP10:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE20]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE21]], [[ELSE]] ] +; OPT-NEXT: [[TMP11:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN]] ], [ 
[[LARGEPHI_EXTRACTSLICE23]], [[ELSE]] ] +; OPT-NEXT: [[TMP12:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE24]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE25]], [[ELSE]] ] +; OPT-NEXT: [[TMP13:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE26]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE27]], [[ELSE]] ] +; OPT-NEXT: [[TMP14:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE28]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE29]], [[ELSE]] ] +; OPT-NEXT: [[TMP15:%.*]] = phi i64 [ [[LARGEPHI_EXTRACTSLICE30]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE31]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <16 x i64> poison, i64 [[TMP0]], i64 0 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE0]], i64 [[TMP1]], i64 1 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE1]], i64 [[TMP2]], i64 2 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE2]], i64 [[TMP3]], i64 3 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE3]], i64 [[TMP4]], i64 4 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE5:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE4]], i64 [[TMP5]], i64 5 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE6:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE5]], i64 [[TMP6]], i64 6 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE7:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE6]], i64 [[TMP7]], i64 7 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE8:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE7]], i64 [[TMP8]], i64 8 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE9:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE8]], i64 [[TMP9]], i64 9 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE10:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE9]], i64 [[TMP10]], i64 10 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE11:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE10]], i64 [[TMP11]], i64 11 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE12:%.*]] = insertelement <16 x i64> 
[[LARGEPHI_INSERTSLICE11]], i64 [[TMP12]], i64 12 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE13:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE12]], i64 [[TMP13]], i64 13 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE14:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE13]], i64 [[TMP14]], i64 14 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE15:%.*]] = insertelement <16 x i64> [[LARGEPHI_INSERTSLICE14]], i64 [[TMP15]], i64 15 +; OPT-NEXT: store <16 x i64> [[LARGEPHI_INSERTSLICE15]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v16i64( @@ -929,28 +929,28 @@ ; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] ; OPT: then: ; OPT-NEXT: [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3 -; OPT-NEXT: [[TMP0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP1:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP2:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP3:%.*]] = extractelement <7 x i16> [[X]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <7 x i16> [[X]], i64 6 ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <7 x i16> [[IN]], i16 9, i32 6 -; OPT-NEXT: [[TMP4:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP5:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP6:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP7:%.*]] = extractelement <7 x i16> [[Y]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x 
i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <7 x i16> [[Y]], i64 6 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], [[THEN]] ], [ [[TMP4]], [[ELSE]] ] -; OPT-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], [[THEN]] ], [ [[TMP5]], [[ELSE]] ] -; OPT-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], [[THEN]] ], [ [[TMP6]], [[ELSE]] ] -; OPT-NEXT: [[TMP11:%.*]] = phi i16 [ [[TMP3]], [[THEN]] ], [ [[TMP7]], [[ELSE]] ] -; OPT-NEXT: [[TMP12:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> poison, <2 x i16> [[TMP8]], i64 0) -; OPT-NEXT: [[TMP13:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> [[TMP12]], <2 x i16> [[TMP9]], i64 2) -; OPT-NEXT: [[TMP14:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> [[TMP13]], <2 x i16> [[TMP10]], i64 4) -; OPT-NEXT: [[TMP15:%.*]] = insertelement <7 x i16> [[TMP14]], i16 [[TMP11]], i64 6 -; OPT-NEXT: store <7 x i16> [[TMP15]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi i16 [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> poison, <2 x i16> [[TMP0]], i64 0) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> [[LARGEPHI_INSERTSLICE0]], <2 x i16> [[TMP1]], 
i64 2) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> [[LARGEPHI_INSERTSLICE1]], <2 x i16> [[TMP2]], i64 4) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <7 x i16> [[LARGEPHI_INSERTSLICE2]], i16 [[TMP3]], i64 6 +; OPT-NEXT: store <7 x i16> [[LARGEPHI_INSERTSLICE3]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v7i16( @@ -991,34 +991,34 @@ ; OPT-NEXT: i8 3, label [[THEN_2:%.*]] ; OPT-NEXT: ] ; OPT: then.1: -; OPT-NEXT: [[TMP0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP1:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP2:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP3:%.*]] = extractelement <7 x i16> [[X]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <7 x i16> [[X]], i64 6 ; OPT-NEXT: br label [[FINALLY:%.*]] ; OPT: then.2: -; OPT-NEXT: [[TMP4:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP5:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP6:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP7:%.*]] = extractelement <7 x i16> [[X]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <7 x i16> [[X]], 
i64 6 ; OPT-NEXT: br label [[FINALLY]] ; OPT: else: ; OPT-NEXT: [[Y:%.*]] = insertelement <7 x i16> [[IN]], i16 9, i32 6 -; OPT-NEXT: [[TMP8:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP9:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP10:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> -; OPT-NEXT: [[TMP11:%.*]] = extractelement <7 x i16> [[Y]], i64 6 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE5:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE11:%.*]] = extractelement <7 x i16> [[Y]], i64 6 ; OPT-NEXT: br label [[FINALLY]] ; OPT: finally: -; OPT-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP0]], [[THEN_1]] ], [ [[TMP4]], [[THEN_2]] ], [ [[TMP8]], [[ELSE]] ] -; OPT-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP1]], [[THEN_1]] ], [ [[TMP5]], [[THEN_2]] ], [ [[TMP9]], [[ELSE]] ] -; OPT-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP2]], [[THEN_1]] ], [ [[TMP6]], [[THEN_2]] ], [ [[TMP10]], [[ELSE]] ] -; OPT-NEXT: [[TMP15:%.*]] = phi i16 [ [[TMP3]], [[THEN_1]] ], [ [[TMP7]], [[THEN_2]] ], [ [[TMP11]], [[ELSE]] ] -; OPT-NEXT: [[TMP16:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> poison, <2 x i16> [[TMP12]], i64 0) -; OPT-NEXT: [[TMP17:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> [[TMP16]], <2 x i16> [[TMP13]], i64 2) -; OPT-NEXT: [[TMP18:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> [[TMP17]], <2 x i16> [[TMP14]], i64 4) -; OPT-NEXT: [[TMP19:%.*]] = insertelement <7 x i16> [[TMP18]], i16 [[TMP15]], i64 6 -; OPT-NEXT: store <7 x i16> [[TMP19]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN_1]] ], [ 
[[LARGEPHI_EXTRACTSLICE1]], [[THEN_2]] ], [ [[LARGEPHI_EXTRACTSLICE2]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN_1]] ], [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN_2]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi <2 x i16> [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN_1]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[THEN_2]] ], [ [[LARGEPHI_EXTRACTSLICE8]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi i16 [ [[LARGEPHI_EXTRACTSLICE9]], [[THEN_1]] ], [ [[LARGEPHI_EXTRACTSLICE10]], [[THEN_2]] ], [ [[LARGEPHI_EXTRACTSLICE11]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> poison, <2 x i16> [[TMP0]], i64 0) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> [[LARGEPHI_INSERTSLICE0]], <2 x i16> [[TMP1]], i64 2) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> [[LARGEPHI_INSERTSLICE1]], <2 x i16> [[TMP2]], i64 4) +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <7 x i16> [[LARGEPHI_INSERTSLICE2]], i16 [[TMP3]], i64 6 +; OPT-NEXT: store <7 x i16> [[LARGEPHI_INSERTSLICE3]], ptr [[OUT:%.*]], align 1 ; OPT-NEXT: ret void ; ; NOOPT-LABEL: @phi_v7i16_switch( @@ -1058,3 +1058,142 @@ store <7 x i16> %val, ptr %out, align 1 ret void } + +; Checks that we can deal with PHIs that have the same basic block/incoming value +; pair appear more than once in the incoming blocks. +; It's not illegal IR. However, if the pass lazily transforms all incoming values +; without checking for duplicates, it could create a PHI with the same basic block +; appearing multiple times, but with different incoming values, which is then illegal +; IR. +; The error was: +; PHI node has multiple entries for the same basic block with different incoming values! 
+define amdgpu_kernel void @multi_inc_same_bb(<5 x double> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @multi_inc_same_bb( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <5 x double> [[X]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <5 x double> [[X]], i64 4 +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <5 x double> [[IN]], double 9.140000e+00, i32 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[Y]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[Y]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <5 x double> [[Y]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <5 x double> [[Y]], i64 3 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <5 x double> [[Y]], i64 4 +; OPT-NEXT: br i1 [[COND]], label [[FINALLY]], label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE4]], [[ELSE]] ], [ [[LARGEPHI_EXTRACTSLICE4]], [[ELSE]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE9]], [[THEN]] ], [ 
[[LARGEPHI_EXTRACTSLICE10]], [[ELSE]] ], [ [[LARGEPHI_EXTRACTSLICE10]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE2]], double [[TMP3]], i64 3 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE3]], double [[TMP4]], i64 4 +; OPT-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE4]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @multi_inc_same_bb( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <5 x double> [[IN]], double 9.140000e+00, i32 2 +; NOOPT-NEXT: br i1 [[COND]], label [[FINALLY]], label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <5 x double> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <5 x double> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <5 x double> %in, double 3.14, i32 3 + br label %finally +else: + %y = insertelement <5 x double> %in, double 9.14, i32 2 + br i1 %cond, label %finally, label %finally +finally: + %val = phi <5 x double> [%x, %then], [%y, %else], [%y, %else] + store <5 x double> %val, ptr %out, align 1 + ret void 
+}
+
+; Checks that we can deal with blocks that just have a PHI + terminator.
+define amdgpu_kernel void @minimal_block_with_only_phi(<5 x double> %in, ptr %out, i1 %cond) {
+; OPT-LABEL: @minimal_block_with_only_phi(
+; OPT-NEXT: entry:
+; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; OPT: then:
+; OPT-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i32 3
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X]], i64 1
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X]], i64 2
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9:%.*]] = extractelement <5 x double> [[X]], i64 3
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <5 x double> [[X]], i64 4
+; OPT-NEXT: br label [[FINALLY:%.*]]
+; OPT: else:
+; OPT-NEXT: [[Y:%.*]] = insertelement <5 x double> [[IN]], double 9.140000e+00, i32 2
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[Y]], i64 0
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[Y]], i64 1
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7:%.*]] = extractelement <5 x double> [[Y]], i64 2
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10:%.*]] = extractelement <5 x double> [[Y]], i64 3
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE13:%.*]] = extractelement <5 x double> [[Y]], i64 4
+; OPT-NEXT: br i1 [[COND]], label [[FINALLY]], label [[FINALLY]]
+; OPT: finally:
+; OPT-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[ELSE]] ]
+; OPT-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE4]], [[ELSE]] ], [ [[LARGEPHI_EXTRACTSLICE4]], [[ELSE]] ]
+; OPT-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[ELSE]] ], [ [[LARGEPHI_EXTRACTSLICE7]], 
[[ELSE]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE9]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE10]], [[ELSE]] ], [ [[LARGEPHI_EXTRACTSLICE10]], [[ELSE]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE12]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ], [ [[LARGEPHI_EXTRACTSLICE13]], [[ELSE]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE2]], double [[TMP3]], i64 3 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE3]], double [[TMP4]], i64 4 +; OPT-NEXT: br label [[REALLYFINALLY:%.*]] +; OPT: reallyfinally: +; OPT-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE4]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @minimal_block_with_only_phi( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <5 x double> [[IN]], double 9.140000e+00, i32 2 +; NOOPT-NEXT: br i1 [[COND]], label [[FINALLY]], label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <5 x double> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: br label [[REALLYFINALLY:%.*]] +; NOOPT: reallyfinally: +; NOOPT-NEXT: store <5 x double> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <5 x double> %in, double 3.14, i32 3 + br label 
%finally +else: + %y = insertelement <5 x double> %in, double 9.14, i32 2 + br i1 %cond, label %finally, label %finally +finally: + %val = phi <5 x double> [%x, %then], [%y, %else], [%y, %else] + br label %reallyfinally +reallyfinally: + store <5 x double> %val, ptr %out, align 1 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -9,6 +9,12 @@ declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 declare <3 x half> @llvm.canonicalize.v3f16(<3 x half>) #0 declare <4 x half> @llvm.canonicalize.v4f16(<4 x half>) #0 +declare <6 x half> @llvm.canonicalize.v6f16(<6 x half>) #0 +declare <8 x half> @llvm.canonicalize.v8f16(<8 x half>) #0 +declare <12 x half> @llvm.canonicalize.v12f16(<12 x half>) #0 +declare <16 x half> @llvm.canonicalize.v16f16(<16 x half>) #0 +declare <32 x half> @llvm.canonicalize.v32f16(<32 x half>) #0 +declare <64 x half> @llvm.canonicalize.v64f16(<64 x half>) #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 { @@ -2227,6 +2233,807 @@ ret <4 x half> %canonicalized } +define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 { +; VI-LABEL: v_test_canonicalize_var_v6f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v2, v2, v2 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; 
VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_var_v6f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_test_canonicalize_var_v6f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: s_setpc_b64 s[30:31] + %canonicalized = call <6 x half> @llvm.canonicalize.v6f16(<6 x half> %val) + ret <6 x half> %canonicalized +} + +define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 { +; VI-LABEL: v_test_canonicalize_var_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v3, v3, v3 +; VI-NEXT: v_max_f16_e32 v2, v2, v2 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v7 +; VI-NEXT: v_or_b32_e32 v1, v1, v6 +; VI-NEXT: v_or_b32_e32 v2, v2, v5 +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_var_v8f16: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_test_canonicalize_var_v8f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: s_setpc_b64 s[30:31] + %canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %val) + ret <8 x half> %canonicalized +} + +define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 { +; VI-LABEL: v_test_canonicalize_var_v12f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v9, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v10, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v11, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v5, v5, v5 +; VI-NEXT: 
v_max_f16_e32 v4, v4, v4 +; VI-NEXT: v_max_f16_e32 v3, v3, v3 +; VI-NEXT: v_max_f16_e32 v2, v2, v2 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_or_b32_e32 v3, v3, v8 +; VI-NEXT: v_or_b32_e32 v4, v4, v7 +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_var_v12f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX9-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_test_canonicalize_var_v12f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: s_setpc_b64 s[30:31] + %canonicalized = call <12 x half> @llvm.canonicalize.v12f16(<12 x half> %val) + ret <12 x half> %canonicalized +} + +define <16 x half> 
@v_test_canonicalize_var_v16f16(<16 x half> %val) #1 { +; VI-LABEL: v_test_canonicalize_var_v16f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v12, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v13, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v14, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v15, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v7, v7, v7 +; VI-NEXT: v_max_f16_e32 v6, v6, v6 +; VI-NEXT: v_max_f16_e32 v5, v5, v5 +; VI-NEXT: v_max_f16_e32 v4, v4, v4 +; VI-NEXT: v_max_f16_e32 v3, v3, v3 +; VI-NEXT: v_max_f16_e32 v2, v2, v2 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v15 +; VI-NEXT: v_or_b32_e32 v1, v1, v14 +; VI-NEXT: v_or_b32_e32 v2, v2, v13 +; VI-NEXT: v_or_b32_e32 v3, v3, v12 +; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_or_b32_e32 v5, v5, v10 +; VI-NEXT: v_or_b32_e32 v6, v6, v9 +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_var_v16f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX9-NEXT: v_pk_max_f16 v5, v5, 
v5 +; GFX9-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX9-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_test_canonicalize_var_v16f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: s_setpc_b64 s[30:31] + %canonicalized = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %val) + ret <16 x half> %canonicalized +} + +define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { +; VI-LABEL: v_test_canonicalize_var_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v20, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v20 +; VI-NEXT: v_max_f16_sdwa v20, v1, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v20 +; VI-NEXT: v_max_f16_sdwa v20, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v2, v2, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v20 +; VI-NEXT: v_max_f16_sdwa v20, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v3, v3, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v20 +; VI-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v4, v4, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v20 +; VI-NEXT: v_max_f16_sdwa v20, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v5, v5, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v20 +; VI-NEXT: v_max_f16_sdwa v20, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v6, v6, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v20 +; VI-NEXT: v_max_f16_sdwa v20, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v7, v7, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v20 +; VI-NEXT: v_max_f16_sdwa v20, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v8, v8, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v20 +; VI-NEXT: v_max_f16_sdwa v20, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v9, v9, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v20 +; VI-NEXT: v_max_f16_sdwa v20, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v10, v10, v10 +; VI-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa 
v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v10, v10, v20 +; VI-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v15, v15, v15 +; VI-NEXT: v_max_f16_e32 v14, v14, v14 +; VI-NEXT: v_max_f16_e32 v13, v13, v13 +; VI-NEXT: v_max_f16_e32 v12, v12, v12 +; VI-NEXT: v_max_f16_e32 v11, v11, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v20 +; VI-NEXT: v_or_b32_e32 v12, v12, v19 +; VI-NEXT: v_or_b32_e32 v13, v13, v18 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_var_v32f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX9-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX9-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX9-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX9-NEXT: v_pk_max_f16 v8, v8, v8 +; GFX9-NEXT: v_pk_max_f16 v9, v9, v9 +; GFX9-NEXT: v_pk_max_f16 v10, v10, v10 +; GFX9-NEXT: v_pk_max_f16 v11, v11, v11 +; GFX9-NEXT: v_pk_max_f16 v12, v12, v12 +; GFX9-NEXT: v_pk_max_f16 v13, v13, v13 +; GFX9-NEXT: v_pk_max_f16 v14, v14, v14 +; GFX9-NEXT: v_pk_max_f16 v15, v15, v15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_test_canonicalize_var_v32f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: 
v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; CI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; 
CI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; CI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: s_setpc_b64 s[30:31] + %canonicalized = call <32 x half> @llvm.canonicalize.v32f16(<32 x half> %val) + ret <32 x half> %canonicalized +} + +define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { +; VI-LABEL: v_test_canonicalize_var_v64f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v31, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v31 +; VI-NEXT: v_max_f16_sdwa v31, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v31 +; VI-NEXT: v_max_f16_sdwa v31, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v2, v2, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v31 +; VI-NEXT: v_max_f16_sdwa v31, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v3, v3, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v31 +; VI-NEXT: v_max_f16_sdwa v31, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v4, v4, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v31 +; VI-NEXT: v_max_f16_sdwa v31, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v5, v5, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v31 +; VI-NEXT: v_max_f16_sdwa v31, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v6, v6, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v31 +; VI-NEXT: v_max_f16_sdwa v31, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v7, v7, v7 +; VI-NEXT: 
v_or_b32_e32 v7, v7, v31 +; VI-NEXT: v_max_f16_sdwa v31, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v8, v8, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v31 +; VI-NEXT: v_max_f16_sdwa v31, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v9, v9, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v31 +; VI-NEXT: v_max_f16_sdwa v31, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v10, v10, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v31 +; VI-NEXT: v_max_f16_sdwa v31, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v11, v11, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v31 +; VI-NEXT: v_max_f16_sdwa v31, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v12, v12, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v31 +; VI-NEXT: v_max_f16_sdwa v31, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v13, v13, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v31 +; VI-NEXT: v_max_f16_sdwa v31, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v14, v14, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v31 +; VI-NEXT: v_max_f16_sdwa v31, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v15, v15, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v31 +; VI-NEXT: v_max_f16_sdwa v31, v16, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v16, v16, v16 +; VI-NEXT: v_or_b32_e32 v16, v16, v31 +; VI-NEXT: v_max_f16_sdwa v31, v17, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v17, v17, v17 +; VI-NEXT: v_or_b32_e32 v17, v17, v31 +; VI-NEXT: v_max_f16_sdwa v31, v18, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v18, v18, v18 +; VI-NEXT: v_or_b32_e32 v18, v18, v31 +; VI-NEXT: v_max_f16_sdwa v31, v19, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v19, v19, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v31 +; VI-NEXT: v_max_f16_sdwa v31, v20, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v20, v20, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v31 +; VI-NEXT: v_max_f16_sdwa v31, v21, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v21, v21, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v31 +; VI-NEXT: v_max_f16_sdwa v31, v22, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v22, v22, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v31 +; VI-NEXT: v_max_f16_sdwa v31, v23, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v23, v23, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v31 +; VI-NEXT: v_max_f16_sdwa v31, v24, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v24, v24, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v31 +; VI-NEXT: v_max_f16_sdwa v31, v25, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v25, v25, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v31 +; VI-NEXT: v_max_f16_sdwa v31, v26, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v26, v26, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v31 +; VI-NEXT: v_max_f16_sdwa v31, v27, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v27, v27, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v31 +; VI-NEXT: v_max_f16_sdwa v31, v28, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v28, v28, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v31 +; VI-NEXT: 
v_max_f16_sdwa v31, v29, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v29, v29, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v31 +; VI-NEXT: v_max_f16_sdwa v31, v30, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v30, v30, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f16_sdwa v32, v31, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v31, v31, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_var_v64f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX9-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX9-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX9-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX9-NEXT: v_pk_max_f16 v8, v8, v8 +; GFX9-NEXT: v_pk_max_f16 v9, v9, v9 +; GFX9-NEXT: v_pk_max_f16 v10, v10, v10 +; GFX9-NEXT: v_pk_max_f16 v11, v11, v11 +; GFX9-NEXT: v_pk_max_f16 v12, v12, v12 +; GFX9-NEXT: v_pk_max_f16 v13, v13, v13 +; GFX9-NEXT: v_pk_max_f16 v14, v14, v14 +; GFX9-NEXT: v_pk_max_f16 v15, v15, v15 +; GFX9-NEXT: v_pk_max_f16 v16, v16, v16 +; GFX9-NEXT: v_pk_max_f16 v17, v17, v17 +; GFX9-NEXT: v_pk_max_f16 v18, v18, v18 +; GFX9-NEXT: v_pk_max_f16 v19, v19, v19 +; GFX9-NEXT: v_pk_max_f16 v20, v20, v20 +; GFX9-NEXT: v_pk_max_f16 v21, v21, v21 +; GFX9-NEXT: v_pk_max_f16 v22, v22, v22 +; GFX9-NEXT: v_pk_max_f16 v23, v23, v23 +; GFX9-NEXT: v_pk_max_f16 v24, v24, v24 +; GFX9-NEXT: v_pk_max_f16 v25, v25, v25 +; GFX9-NEXT: v_pk_max_f16 v26, v26, v26 +; GFX9-NEXT: v_pk_max_f16 v27, v27, v27 +; GFX9-NEXT: v_pk_max_f16 v28, 
v28, v28 +; GFX9-NEXT: v_pk_max_f16 v29, v29, v29 +; GFX9-NEXT: v_pk_max_f16 v30, v30, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v31, v31, v31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_test_canonicalize_var_v64f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v2, v3, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v3, v4, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v26 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_or_b32_e32 v4, v5, v4 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; CI-NEXT: v_or_b32_e32 v5, v7, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v17 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v22 +; CI-NEXT: v_or_b32_e32 v6, v7, v6 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; CI-NEXT: v_or_b32_e32 v7, v9, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v25 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v29 +; CI-NEXT: v_or_b32_e32 v8, v9, v8 +; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v20 +; CI-NEXT: v_or_b32_e32 v9, v11, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; CI-NEXT: buffer_load_dword v16, 
off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; CI-NEXT: v_or_b32_e32 v10, v11, v10 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v24 +; CI-NEXT: v_or_b32_e32 v11, v13, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v30 +; CI-NEXT: v_or_b32_e32 v12, v13, v12 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; CI-NEXT: v_or_b32_e32 v13, v15, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v27 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_or_b32_e32 v14, v15, v14 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v24 +; CI-NEXT: v_or_b32_e32 v15, v25, v15 +; CI-NEXT: s_waitcnt vmcnt(11) +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: s_waitcnt vmcnt(10) +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: s_waitcnt vmcnt(9) +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; CI-NEXT: v_or_b32_e32 v16, v17, v16 +; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; CI-NEXT: v_or_b32_e32 v17, v19, v17 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; CI-NEXT: s_waitcnt 
vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v20, v22 +; CI-NEXT: s_waitcnt vmcnt(4) +; CI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_or_b32_e32 v18, v19, v18 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; CI-NEXT: v_or_b32_e32 v19, v21, v19 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v21, v27 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v28 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v29 +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; CI-NEXT: v_or_b32_e32 v20, v21, v20 +; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; CI-NEXT: v_or_b32_e32 v21, v27, v21 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: s_waitcnt vmcnt(4) +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; CI-NEXT: v_or_b32_e32 v24, v25, v24 +; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; CI-NEXT: v_or_b32_e32 v26, v27, v26 +; CI-NEXT: v_add_i32_e32 v27, vcc, 0x7c, v0 +; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 +; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; CI-NEXT: v_or_b32_e32 v22, v22, v23 +; 
CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; CI-NEXT: v_or_b32_e32 v26, v27, v26 +; CI-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0 +; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; CI-NEXT: v_or_b32_e32 v26, v27, v26 +; CI-NEXT: v_add_i32_e32 v27, vcc, 0x74, v0 +; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v25, v26 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 +; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; CI-NEXT: v_or_b32_e32 v25, v26, v25 +; CI-NEXT: v_add_i32_e32 v26, vcc, 0x70, v0 +; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; CI-NEXT: v_or_b32_e32 v23, v23, v27 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0 +; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; CI-NEXT: v_or_b32_e32 v25, v26, v25 +; CI-NEXT: v_add_i32_e32 v26, vcc, 
0x6c, v0 +; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 +; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; CI-NEXT: v_or_b32_e32 v25, v26, v25 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v29 +; CI-NEXT: v_or_b32_e32 v23, v26, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v26, v28 +; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; CI-NEXT: v_or_b32_e32 v26, v27, v26 +; CI-NEXT: v_add_i32_e32 v27, vcc, 0x64, v0 +; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v26, vcc, 0x60, v0 +; CI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v23, vcc, 0x5c, v0 +; CI-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v23, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v22, vcc, 0x54, v0 +; CI-NEXT: buffer_store_dword v24, v22, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0 +; CI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v21, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 +; CI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 +; CI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; CI-NEXT: 
v_add_i32_e32 v18, vcc, 64, v0 +; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 60, v0 +; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 +; CI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; CI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 +; CI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; CI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v12, vcc, 40, v0 +; CI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; CI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; CI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; CI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; CI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; CI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; CI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; CI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 +; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] + %canonicalized = call <64 x half> @llvm.canonicalize.v64f16(<64 x half> %val) + ret <64 x half> %canonicalized +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -21,6 +21,21 @@ ret void } +define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { +; GCN-LABEL: set_inactive_imm_poison: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm + %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 + store i32 %tmp, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: @@ -43,6 +58,22 @@ ret void } +define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { +; GCN-LABEL: set_inactive_imm_poison_64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm + %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 + store i64 %tmp, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: @@ -58,20 +89,20 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s3, 56 ; GCN-NEXT: s_mov_b64 s[2:3], -1 -; GCN-NEXT: s_cbranch_scc1 .LBB2_3 +; GCN-NEXT: s_cbranch_scc1 .LBB4_3 ; GCN-NEXT: ; %bb.1: ; %Flow ; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN-NEXT: s_cbranch_vccz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %.exit +; GCN-NEXT: s_cbranch_vccz 
.LBB4_4 +; GCN-NEXT: .LBB4_2: ; %.exit ; GCN-NEXT: s_endpgm -; GCN-NEXT: .LBB2_3: ; %.one +; GCN-NEXT: .LBB4_3: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_mov_b64 s[2:3], 0 -; GCN-NEXT: s_cbranch_execnz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %.zero +; GCN-NEXT: s_cbranch_execnz .LBB4_2 +; GCN-NEXT: .LBB4_4: ; %.zero ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/PowerPC/vec-zext-abdu.ll b/llvm/test/CodeGen/PowerPC/vec-zext-abdu.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vec-zext-abdu.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le -mcpu=pwr9 < %s | FileCheck %s + +define <12 x i8> @zext_abdu(<12 x i8> %a, <12 x i8> %b) { +; CHECK-LABEL: zext_abdu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha +; CHECK-NEXT: xxlxor 36, 36, 36 +; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l +; CHECK-NEXT: lxv 37, 0(3) +; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha +; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l +; CHECK-NEXT: lxv 33, 0(3) +; CHECK-NEXT: addis 3, 2, .LCPI0_2@toc@ha +; CHECK-NEXT: vperm 0, 4, 2, 5 +; CHECK-NEXT: vperm 5, 4, 3, 5 +; CHECK-NEXT: addi 3, 3, .LCPI0_2@toc@l +; CHECK-NEXT: lxv 39, 0(3) +; CHECK-NEXT: vperm 6, 4, 2, 1 +; CHECK-NEXT: vperm 1, 4, 3, 1 +; CHECK-NEXT: vperm 2, 4, 2, 7 +; CHECK-NEXT: vperm 3, 4, 3, 7 +; CHECK-NEXT: xvnegsp 36, 38 +; CHECK-NEXT: xvnegsp 35, 35 +; CHECK-NEXT: xvnegsp 34, 34 +; CHECK-NEXT: vabsduw 2, 2, 3 +; CHECK-NEXT: xvnegsp 35, 33 +; CHECK-NEXT: vabsduw 3, 4, 3 +; CHECK-NEXT: xvnegsp 36, 37 +; CHECK-NEXT: xvnegsp 37, 32 +; CHECK-NEXT: vpkuwum 2, 2, 2 +; CHECK-NEXT: vabsduw 4, 5, 4 +; CHECK-NEXT: vpkuwum 3, 4, 3 +; CHECK-NEXT: vpkuhum 2, 2, 3 +; 
CHECK-NEXT: blr +entry: + %aa = zext <12 x i8> %a to <12 x i32> + %bb = zext <12 x i8> %b to <12 x i32> + %s = sub nsw <12 x i32> %aa, %bb + %c = icmp slt <12 x i32> %s, zeroinitializer + %ss = sub nsw <12 x i32> zeroinitializer, %s + %sel = select <12 x i1> %c, <12 x i32> %ss, <12 x i32> %s + %ret = trunc <12 x i32> %sel to <12 x i8> + ret <12 x i8> %ret +} diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -75,8 +75,8 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-smaia %s -o - | FileCheck --check-prefixes=CHECK,RV32SMAIA %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-ssaia %s -o - | FileCheck --check-prefixes=CHECK,RV32SSAIA %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s -; RUN: llc -mtriple=riscv32 -mattr=+f,+experimental-zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFMIN %s -; RUN: llc -mtriple=riscv32 -mattr=+f,+experimental-zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFWMA %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFMIN %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFWMA %s ; RUN: llc -mtriple=riscv64 %s -o - | FileCheck %s ; RUN: llc -mtriple=riscv64 -mattr=+m %s -o - | FileCheck --check-prefixes=CHECK,RV64M %s @@ -159,8 +159,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+experimental-smaia %s -o - | FileCheck --check-prefixes=CHECK,RV64SMAIA %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-ssaia %s -o - | FileCheck --check-prefixes=CHECK,RV64SSAIA %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s -; RUN: llc -mtriple=riscv64 -mattr=+f,+experimental-zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFMIN %s -; RUN: 
llc -mtriple=riscv64 -mattr=+f,+experimental-zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFWMA %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFMIN %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFWMA %s ; CHECK: .attribute 4, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s +; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s define <5 x i8> @load_v5i8(ptr %p) { ; RV32-LABEL: load_v5i8: @@ -123,29 +123,11 @@ } define <6 x half> @load_v6f16(ptr %p) { -; RV32-LABEL: load_v6f16: -; RV32: # %bb.0: -; RV32-NEXT: lw a2, 8(a1) -; RV32-NEXT: lw a3, 4(a1) -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: sw a2, 8(a0) -; RV32-NEXT: sw a3, 4(a0) -; RV32-NEXT: sw a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: load_v6f16: -; RV64: # %bb.0: -; RV64-NEXT: ld a2, 0(a1) -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: ld a1, 8(a1) -; RV64-NEXT: vslide1down.vx v8, v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a1 -; RV64-NEXT: sd a2, 0(a0) -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 2 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: ret +; CHECK-LABEL: load_v6f16: 
+; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: ret %x = load <6 x half>, ptr %p ret <6 x half> %x } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s +; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s define void @store_v5i8(ptr %p, <5 x i8> %v) { ; CHECK-LABEL: store_v5i8: @@ -103,56 +103,16 @@ define void @store_v6f16(ptr %p, <6 x half> %v) { ; RV32-LABEL: store_v6f16: ; RV32: # %bb.0: -; RV32-NEXT: lh a2, 20(a1) -; RV32-NEXT: lhu a3, 16(a1) -; RV32-NEXT: slli a2, a2, 16 -; RV32-NEXT: or a2, a3, a2 -; RV32-NEXT: lh a3, 12(a1) -; RV32-NEXT: lhu a4, 8(a1) -; RV32-NEXT: lh a5, 4(a1) -; RV32-NEXT: lhu a1, 0(a1) -; RV32-NEXT: slli a3, a3, 16 -; RV32-NEXT: or a3, a4, a3 -; RV32-NEXT: slli a5, a5, 16 -; RV32-NEXT: or a1, a1, a5 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a3 -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslidedown.vi v8, v8, 1 ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: vslidedown.vi v9, v8, 2 ; RV32-NEXT: addi a1, a0, 8 ; RV32-NEXT: vse32.v v9, (a1) -; RV32-NEXT: vslidedown.vi v8, v8, 1 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: vse32.v v8, 
(a0) +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vse16.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: store_v6f16: ; RV64: # %bb.0: -; RV64-NEXT: lhu a2, 16(a1) -; RV64-NEXT: lh a3, 24(a1) -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: lh a4, 8(a1) -; RV64-NEXT: lhu a5, 0(a1) -; RV64-NEXT: slli a3, a3, 48 -; RV64-NEXT: or a2, a3, a2 -; RV64-NEXT: slli a4, a4, 16 -; RV64-NEXT: or a4, a5, a4 -; RV64-NEXT: slli a4, a4, 32 -; RV64-NEXT: lh a3, 40(a1) -; RV64-NEXT: lhu a1, 32(a1) -; RV64-NEXT: srli a4, a4, 32 -; RV64-NEXT: or a2, a4, a2 -; RV64-NEXT: slli a3, a3, 16 -; RV64-NEXT: or a1, a1, a3 -; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vslide1down.vx v8, v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a1 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma diff --git a/llvm/test/CodeGen/X86/prefetchi.ll b/llvm/test/CodeGen/X86/prefetchi.ll --- a/llvm/test/CodeGen/X86/prefetchi.ll +++ b/llvm/test/CodeGen/X86/prefetchi.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=+prefetchi | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=NOPREFETCHI define dso_local void @t(ptr %ptr) nounwind { ; CHECK-LABEL: t: @@ -9,6 +10,10 @@ ; CHECK-NEXT: prefetchit1 t(%rip) ; CHECK-NEXT: prefetchit0 ext(%rip) ; CHECK-NEXT: retq +; +; NOPREFETCHI-LABEL: t: +; NOPREFETCHI: # %bb.0: # %entry +; NOPREFETCHI-NEXT: retq entry: tail call void @llvm.prefetch(ptr %ptr, i32 0, i32 2, i32 0) tail call void @llvm.prefetch(ptr %ptr, i32 0, i32 3, i32 0) diff --git a/llvm/test/Examples/OrcV2Examples/lljit-with-remote-debugging.test b/llvm/test/Examples/OrcV2Examples/lljit-with-remote-debugging.test --- a/llvm/test/Examples/OrcV2Examples/lljit-with-remote-debugging.test +++ 
b/llvm/test/Examples/OrcV2Examples/lljit-with-remote-debugging.test @@ -2,6 +2,7 @@ # Instructions for debugging can be found in LLJITWithRemoteDebugging.cpp # REQUIRES: default_triple +# UNSUPPORTED: target=powerpc64{{.*}} # RUN: LLJITWithRemoteDebugging %p/Inputs/argc_sub1_elf.ll | FileCheck --check-prefix=CHECK0 %s # CHECK0: Parsing input IR code from: {{.*}}/Inputs/argc_sub1_elf.ll diff --git a/llvm/test/Examples/OrcV2Examples/lljit-with-thinlto-summaries.test b/llvm/test/Examples/OrcV2Examples/lljit-with-thinlto-summaries.test --- a/llvm/test/Examples/OrcV2Examples/lljit-with-thinlto-summaries.test +++ b/llvm/test/Examples/OrcV2Examples/lljit-with-thinlto-summaries.test @@ -3,6 +3,7 @@ # RUN: opt -module-summary %p/Inputs/bar-mod.ll -o %T/bar-mod.bc # REQUIRES: default_triple +# UNSUPPORTED: target=powerpc64{{.*}} # RUN: llvm-lto -thinlto -o %T/main-foo-bar %T/main-mod.bc %T/foo-mod.bc %T/bar-mod.bc diff --git a/llvm/test/Examples/OrcV2Examples/orcv2-cbindings-lazy.test b/llvm/test/Examples/OrcV2Examples/orcv2-cbindings-lazy.test --- a/llvm/test/Examples/OrcV2Examples/orcv2-cbindings-lazy.test +++ b/llvm/test/Examples/OrcV2Examples/orcv2-cbindings-lazy.test @@ -1,5 +1,7 @@ # RUN: OrcV2CBindingsLazy 2>&1 | FileCheck -check-prefix=THIS %s # RUN: OrcV2CBindingsLazy 0 2>&1 | FileCheck -check-prefix=OTHER %s +# UNSUPPORTED: target=powerpc64{{.*}} + # THIS: entry(1) = 1 # OTHER: entry(2) = 2 diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_riscv64_got_plt_reloc.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_riscv64_got_plt_reloc.s --- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_riscv64_got_plt_reloc.s +++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_riscv64_got_plt_reloc.s @@ -6,6 +6,13 @@ # RUN: -abs external_func=0x1 -abs external_data=0x2 \ # RUN: -check %s %t/elf_riscv64_got_plt_reloc.o +## Run the same tests with relaxation enabled. 
+# RUN: llvm-mc -triple=riscv64 -position-independent -filetype=obj \ +# RUN: -mattr=+relax -o %t/elf_riscv64_got_plt_reloc.o %s +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0xfff00000 -slab-page-size 4096 \ +# RUN: -abs external_func=0x1 -abs external_data=0x2 \ +# RUN: -check %s %t/elf_riscv64_got_plt_reloc.o .text .file "testcase.c" diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-duplicate-local.test b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-duplicate-local.test --- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-duplicate-local.test +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-duplicate-local.test @@ -12,6 +12,7 @@ # CHECK-NEXT: target = _foo --- !mach-o +IsLittleEndian: true FileHeader: magic: 0xFEEDFACF cputype: 0x1000007 diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_ehframe_bad_fde_cie-ptr_out-of-range.test b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_ehframe_bad_fde_cie-ptr_out-of-range.test --- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_ehframe_bad_fde_cie-ptr_out-of-range.test +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_ehframe_bad_fde_cie-ptr_out-of-range.test @@ -6,6 +6,7 @@ # CHECK: llvm-jitlink error: No CIE found at address --- !mach-o +IsLittleEndian: true FileHeader: magic: 0xFEEDFACF cputype: 0x1000007 diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_ehframe_bad_fde_pc-begin_out-of-range.test b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_ehframe_bad_fde_pc-begin_out-of-range.test --- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_ehframe_bad_fde_pc-begin_out-of-range.test +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_ehframe_bad_fde_pc-begin_out-of-range.test @@ -6,6 +6,7 @@ # CHECK: llvm-jitlink error: No symbol or block covering address --- !mach-o +IsLittleEndian: true FileHeader: magic: 0xFEEDFACF cputype: 0x1000007 diff --git a/llvm/test/ExecutionEngine/lit.local.cfg b/llvm/test/ExecutionEngine/lit.local.cfg --- 
a/llvm/test/ExecutionEngine/lit.local.cfg +++ b/llvm/test/ExecutionEngine/lit.local.cfg @@ -1,4 +1,4 @@ -if config.root.native_target in ["Sparc", "PowerPC", "SystemZ", "Hexagon", "RISCV"]: +if config.root.native_target in ['Sparc', 'SystemZ', 'Hexagon', 'RISCV']: config.unsupported = True # ExecutionEngine tests are not expected to pass in a cross-compilation setup. diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -267,8 +267,8 @@ .attribute arch, "rv32if_zfbfmin0p6" # CHECK: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin0p6" -.attribute arch, "rv32if_zvfbfmin0p6" +.attribute arch, "rv32i_zvfbfmin0p6" # CHECK: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin0p6_zvl32b1p0" -.attribute arch, "rv32if_zvfbfwma0p6" +.attribute arch, "rv32i_zvfbfwma0p6" # CHECK: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfwma0p6_zvl32b1p0" diff --git a/llvm/test/MC/RISCV/rv32zfbfmin-valid.s b/llvm/test/MC/RISCV/rv32zfbfmin-valid.s --- a/llvm/test/MC/RISCV/rv32zfbfmin-valid.s +++ b/llvm/test/MC/RISCV/rv32zfbfmin-valid.s @@ -1,12 +1,12 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zfbfmin,+f -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zfbfmin -riscv-no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zfbfmin,+f -riscv-no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zfbfmin -riscv-no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zfbfmin,+d < %s \ -# RUN: | llvm-objdump --mattr=+experimental-zfbfmin,+f -M no-aliases -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zfbfmin,+f < %s \ +# RUN: | llvm-objdump 
--mattr=+experimental-zfbfmin -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zfbfmin,+d < %s \ -# RUN: | llvm-objdump --mattr=+experimental-zfbfmin,+f -M no-aliases -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zfbfmin,+f < %s \ +# RUN: | llvm-objdump --mattr=+experimental-zfbfmin -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # CHECK-ASM-AND-OBJ: flh ft0, 12(a0) diff --git a/llvm/test/MC/RISCV/rvv/zvfbfmin.s b/llvm/test/MC/RISCV/rvv/zvfbfmin.s --- a/llvm/test/MC/RISCV/rvv/zvfbfmin.s +++ b/llvm/test/MC/RISCV/rvv/zvfbfmin.s @@ -1,20 +1,20 @@ -# RUN: llvm-mc -triple=riscv32 -show-encoding -mattr=+f,+experimental-zvfbfmin %s \ +# RUN: llvm-mc -triple=riscv32 -show-encoding -mattr=+experimental-zvfbfmin %s \ # RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST # RUN: not llvm-mc -triple=riscv32 -show-encoding -mattr=+v,+f %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR -# RUN: llvm-mc -triple=riscv32 -filetype=obj -mattr=+f,+experimental-zvfbfmin %s \ -# RUN: | llvm-objdump -d --mattr=+f,+experimental-zvfbfmin - \ +# RUN: llvm-mc -triple=riscv32 -filetype=obj -mattr=+experimental-zvfbfmin %s \ +# RUN: | llvm-objdump -d --mattr=+experimental-zvfbfmin - \ # RUN: | FileCheck %s --check-prefix=CHECK-INST -# RUN: llvm-mc -triple=riscv32 -filetype=obj -mattr=+f,+experimental-zvfbfmin %s \ +# RUN: llvm-mc -triple=riscv32 -filetype=obj -mattr=+experimental-zvfbfmin %s \ # RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN -# RUN: llvm-mc -triple=riscv64 -show-encoding -mattr=+f,+experimental-zvfbfmin %s \ +# RUN: llvm-mc -triple=riscv64 -show-encoding -mattr=+experimental-zvfbfmin %s \ # RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST # RUN: not llvm-mc -triple=riscv64 -show-encoding -mattr=+v,+f %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR -# RUN: 
llvm-mc -triple=riscv64 -filetype=obj -mattr=+f,+experimental-zvfbfmin %s \ -# RUN: | llvm-objdump -d --mattr=+f,+experimental-zvfbfmin - \ +# RUN: llvm-mc -triple=riscv64 -filetype=obj -mattr=+experimental-zvfbfmin %s \ +# RUN: | llvm-objdump -d --mattr=+experimental-zvfbfmin - \ # RUN: | FileCheck %s --check-prefix=CHECK-INST -# RUN: llvm-mc -triple=riscv64 -filetype=obj -mattr=+f,+experimental-zvfbfmin %s \ +# RUN: llvm-mc -triple=riscv64 -filetype=obj -mattr=+experimental-zvfbfmin %s \ # RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN # CHECK-INST: vfncvtbf16.f.f.w v8, v4, v0.t diff --git a/llvm/test/MC/RISCV/rvv/zvfbfwma.s b/llvm/test/MC/RISCV/rvv/zvfbfwma.s --- a/llvm/test/MC/RISCV/rvv/zvfbfwma.s +++ b/llvm/test/MC/RISCV/rvv/zvfbfwma.s @@ -1,20 +1,20 @@ -# RUN: llvm-mc -triple=riscv32 -show-encoding -mattr=+f,+experimental-zvfbfwma %s \ +# RUN: llvm-mc -triple=riscv32 -show-encoding -mattr=+experimental-zvfbfwma %s \ # RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST # RUN: not llvm-mc -triple=riscv32 -show-encoding -mattr=+v,+f %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR -# RUN: llvm-mc -triple=riscv32 -filetype=obj -mattr=+f,+experimental-zvfbfwma %s \ -# RUN: | llvm-objdump -d --mattr=+f,+experimental-zvfbfwma - \ +# RUN: llvm-mc -triple=riscv32 -filetype=obj -mattr=+experimental-zvfbfwma %s \ +# RUN: | llvm-objdump -d --mattr=+experimental-zvfbfwma - \ # RUN: | FileCheck %s --check-prefix=CHECK-INST -# RUN: llvm-mc -triple=riscv32 -filetype=obj -mattr=+f,+experimental-zvfbfwma %s \ +# RUN: llvm-mc -triple=riscv32 -filetype=obj -mattr=+experimental-zvfbfwma %s \ # RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN -# RUN: llvm-mc -triple=riscv64 -show-encoding -mattr=+f,+experimental-zvfbfwma %s \ +# RUN: llvm-mc -triple=riscv64 -show-encoding -mattr=+experimental-zvfbfwma %s \ # RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST # RUN: not llvm-mc -triple=riscv64 
-show-encoding -mattr=+v,+f %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR -# RUN: llvm-mc -triple=riscv64 -filetype=obj -mattr=+f,+experimental-zvfbfwma %s \ -# RUN: | llvm-objdump -d --mattr=+f,+experimental-zvfbfwma - \ +# RUN: llvm-mc -triple=riscv64 -filetype=obj -mattr=+experimental-zvfbfwma %s \ +# RUN: | llvm-objdump -d --mattr=+experimental-zvfbfwma - \ # RUN: | FileCheck %s --check-prefix=CHECK-INST -# RUN: llvm-mc -triple=riscv64 -filetype=obj -mattr=+f,+experimental-zvfbfwma %s \ +# RUN: llvm-mc -triple=riscv64 -filetype=obj -mattr=+experimental-zvfbfwma %s \ # RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN # CHECK-INST: vfwmaccbf16.vv v8, v20, v4, v0.t diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -9,83 +9,83 @@ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O1,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O1,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify 
-verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-NO-FUNC-SPEC,CHECK-Os,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-Os,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-NO-FUNC-SPEC,CHECK-Oz,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-Oz,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto-pre-link' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-LTO,CHECK-NO-FUNC-SPEC,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-LTO,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-peephole='no-op-function' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-PEEPHOLE,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-PEEPHOLE,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-late-loop-optimizations='no-op-loop' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-LOOP-LATE,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-LOOP-LATE,CHECK-O23SZ ; RUN: opt -disable-verify 
-verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-loop-optimizer-end='no-op-loop' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-LOOP-END,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-LOOP-END,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-scalar-optimizer-late='no-op-function' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-SCALAR-LATE,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-SCALAR-LATE,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-cgscc-optimizer-late='no-op-cgscc' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-CGSCC-LATE,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-CGSCC-LATE,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-vectorizer-start='no-op-function' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-VECTORIZER-START,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-VECTORIZER-START,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-pipeline-start='no-op-module' \ ; RUN: -passes='default' -S %s 
2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-START,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-START,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-pipeline-early-simplification='no-op-module' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-EARLY-SIMPLIFICATION,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-EARLY-SIMPLIFICATION,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-pipeline-start='no-op-module' \ ; RUN: -passes='lto-pre-link' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-LTO,CHECK-NO-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-START,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-LTO,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-START,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-optimizer-early='no-op-module' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-EARLY,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-EARLY,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-optimizer-last='no-op-module' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s 
--check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-LAST,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-LAST,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -enable-matrix -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MATRIX +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MATRIX ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -enable-merge-functions -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MERGE-FUNCS +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MERGE-FUNCS ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -ir-outliner -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-IR-OUTLINER +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-IR-OUTLINER ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -hot-cold-split -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-HOT-COLD-SPLIT +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-HOT-COLD-SPLIT ; Suppress FileCheck --allow-unused-prefixes=false diagnostics. 
; CHECK-Oz: {{^}} @@ -109,7 +109,6 @@ ; CHECK-O-NEXT: Running pass: OpenMPOptPass ; CHECK-EP-PIPELINE-EARLY-SIMPLIFICATION-NEXT: Running pass: NoOpModulePass ; CHECK-O-NEXT: Running pass: IPSCCPPass -; CHECK-FUNC-SPEC-NEXT: Running analysis: LoopAnalysis ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: GlobalOptPass ; CHECK-O-NEXT: Running pass: PromotePass @@ -164,7 +163,7 @@ ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: ReassociatePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass -; CHECK-NO-FUNC-SPEC-NEXT: Running analysis: LoopAnalysis +; CHECK-O-NEXT: Running analysis: LoopAnalysis ; CHECK-O-NEXT: Running pass: LCSSAPass ; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll --- a/llvm/test/Other/new-pm-lto-defaults.ll +++ b/llvm/test/Other/new-pm-lto-defaults.ll @@ -9,23 +9,23 @@ ; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-EP ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s -passes-ep-full-link-time-optimization-early=no-op-module \ ; RUN: -passes-ep-full-link-time-optimization-last=no-op-module 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23,CHECK-O23SZ,CHECK-EP +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-EP ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s 2>&1 \ -; RUN: | FileCheck %s 
--check-prefixes=CHECK-O,CHECK-O3,CHECK-O23,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OS,CHECK-OSZ,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OS,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OSZ,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s -passes-ep-peephole='no-op-function' 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23,CHECK-O23SZ,CHECK-EP-Peephole +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-EP-Peephole ; CHECK-EP: Running pass: NoOpModulePass ; CHECK-O: Running pass: CrossDSOCFIPass @@ -43,7 +43,6 @@ ; CHECK-O23SZ-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O23SZ-NEXT: Running pass: IPSCCPPass ; CHECK-O23SZ-NEXT: Running analysis: AssumptionAnalysis on foo -; CHECK-O23-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O23SZ-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}SCC ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis @@ -94,7 +93,7 @@ ; CHECK-O23SZ-NEXT: Invalidating analysis: AAManager on foo ; CHECK-O23SZ-NEXT: Running pass: OpenMPOptCGSCCPass on (foo) ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass on foo -; CHECK-OSZ-NEXT: Running analysis: LoopAnalysis on foo +; CHECK-O23SZ-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O23SZ-NEXT: Running pass: LCSSAPass on foo ; CHECK-O23SZ-NEXT: Running analysis: 
MemorySSAAnalysis on foo ; CHECK-O23SZ-NEXT: Running analysis: AAManager on foo diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -10,28 +10,28 @@ ; Postlink pipelines: ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O1,CHECK-POSTLINK-O,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-POSTLINK-O,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O2 +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O2 ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -passes-ep-pipeline-start='no-op-module' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3 +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3 ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -passes-ep-optimizer-early='no-op-module' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3,CHECK-POST-EP-OPT-EARLY +; RUN: | FileCheck %s 
--check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3,CHECK-POST-EP-OPT-EARLY ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -passes-ep-optimizer-last='no-op-module' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3,CHECK-POST-EP-OPT-LAST +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3,CHECK-POST-EP-OPT-LAST ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-NO-FUNC-SPEC,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-Os +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-Os ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-NO-FUNC-SPEC,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -debug-info-for-profiling \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O2 +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O2 ; Suppress FileCheck --allow-unused-prefixes=false diagnostics. 
; CHECK-NOEXT: {{^}} @@ -58,7 +58,6 @@ ; CHECK-O-NEXT: Running pass: OpenMPOptPass ; CHECK-POSTLINK-O-NEXT: Running pass: LowerTypeTestsPass ; CHECK-O-NEXT: Running pass: IPSCCPPass -; CHECK-FUNC-SPEC-NEXT: Running analysis: LoopAnalysis ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: GlobalOptPass ; CHECK-O-NEXT: Running pass: PromotePass @@ -108,7 +107,7 @@ ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: ReassociatePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass -; CHECK-NO-FUNC-SPEC-NEXT: Running analysis: LoopAnalysis +; CHECK-O-NEXT: Running analysis: LoopAnalysis ; CHECK-O-NEXT: Running pass: LCSSAPass ; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -3,22 +3,22 @@ ; Postlink pipelines: ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O1,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -passes-ep-pipeline-start='no-op-module' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s 
--check-prefixes=CHECK-O,CHECK-O123,CHECK-O3,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OSZ,CHECK-Os,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OSZ,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -debug-info-for-profiling \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; Suppress FileCheck --allow-unused-prefixes=false diagnostics. 
; CHECK-NOEXT: {{^}} @@ -43,7 +43,6 @@ ; CHECK-O-NEXT: Running pass: OpenMPOptPass ; CHECK-O-NEXT: Running pass: LowerTypeTestsPass ; CHECK-O-NEXT: Running pass: IPSCCPPass -; CHECK-O123-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: GlobalOptPass ; CHECK-O-NEXT: Running pass: PromotePass @@ -55,7 +54,7 @@ ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo -; CHECK-OSZ-NEXT: Running analysis: LoopAnalysis on foo +; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -3,27 +3,27 @@ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O1,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s 
--check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -passes-ep-pipeline-start='no-op-module' \ ; RUN: -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O3,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OSZ,CHECK-Os,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OSZ,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -debug-info-for-profiling \ ; RUN: -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; Suppress FileCheck --allow-unused-prefixes=false diagnostics. 
; CHECK-NOEXT: {{^}} @@ -51,7 +51,6 @@ ; CHECK-O-NEXT: Running pass: OpenMPOptPass ; CHECK-O-NEXT: Running pass: LowerTypeTestsPass ; CHECK-O-NEXT: Running pass: IPSCCPPass -; CHECK-O123-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: GlobalOptPass ; CHECK-O-NEXT: Running pass: PromotePass @@ -63,7 +62,7 @@ ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo -; CHECK-OSZ-NEXT: Running analysis: LoopAnalysis on foo +; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo ; CHECK-O-NEXT: Running pass: SimplifyCFGPass on foo ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass diff --git a/llvm/test/Transforms/Attributor/nofpclass.ll b/llvm/test/Transforms/Attributor/nofpclass.ll --- a/llvm/test/Transforms/Attributor/nofpclass.ll +++ b/llvm/test/Transforms/Attributor/nofpclass.ll @@ -739,6 +739,127 @@ ret float %fabs } +define float @returned_fabs_nopos(float nofpclass(psub pnorm pinf) %x) { +; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nopos +; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[X:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf psub pnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: ret float [[FABS]] +; + %fabs = call float @llvm.fabs.f32(float %x) + ret float %fabs +} + +define float @returned_fabs_nopos_nopzero(float nofpclass(psub pnorm pinf pzero) %x) { +; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nopos_nopzero +; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) 
[[X:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf pzero psub pnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: ret float [[FABS]] +; + %fabs = call float @llvm.fabs.f32(float %x) + ret float %fabs +} + +define float @returned_fabs_nopos_nozero(float nofpclass(psub pnorm pinf zero) %x) { +; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf zero nsub nnorm) float @returned_fabs_nopos_nozero +; CHECK-SAME: (float nofpclass(pinf zero psub pnorm) [[X:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(ninf zero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf zero psub pnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: ret float [[FABS]] +; + %fabs = call float @llvm.fabs.f32(float %x) + ret float %fabs +} + +define float @returned_fabs_nopos_nonan(float nofpclass(psub pnorm pinf nan) %x) { +; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @returned_fabs_nopos_nonan +; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[X:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan pinf psub pnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: ret float [[FABS]] +; + %fabs = call float @llvm.fabs.f32(float %x) + ret float %fabs +} + +define float @returned_fabs_noneg(float nofpclass(nsub nnorm ninf) %x) { +; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_noneg +; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nsub nnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: ret float [[FABS]] +; + %fabs = call float @llvm.fabs.f32(float %x) + ret 
float %fabs +} + +define float @returned_fabs_noneg_nonzero(float nofpclass(nsub nnorm ninf nzero) %x) { +; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_noneg_nonzero +; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: ret float [[FABS]] +; + %fabs = call float @llvm.fabs.f32(float %x) + ret float %fabs +} + +define float @returned_fabs_noneg_nozero(float nofpclass(nsub nnorm ninf zero) %x) { +; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf zero nsub nnorm) float @returned_fabs_noneg_nozero +; CHECK-SAME: (float nofpclass(ninf zero nsub nnorm) [[X:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(ninf zero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf zero nsub nnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: ret float [[FABS]] +; + %fabs = call float @llvm.fabs.f32(float %x) + ret float %fabs +} + +define float @returned_fabs_noneg_nonan(float nofpclass(nsub nnorm ninf nan) %x) { +; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @returned_fabs_noneg_nonan +; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[X:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan ninf nsub nnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: ret float [[FABS]] +; + %fabs = call float @llvm.fabs.f32(float %x) + ret float %fabs +} + +define float @returned_fabs_nonsub_nopnorm_nonzero(float nofpclass(nsub pnorm nzero) %x) { +; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define 
nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nonsub_nopnorm_nonzero +; CHECK-SAME: (float nofpclass(nzero nsub pnorm) [[X:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nzero nsub pnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: ret float [[FABS]] +; + %fabs = call float @llvm.fabs.f32(float %x) + ret float %fabs +} + +define float @returned_fabs_nopsub_nonnorm_nopzero(float nofpclass(psub nnorm pzero) %x) { +; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nopsub_nonnorm_nopzero +; CHECK-SAME: (float nofpclass(pzero psub nnorm) [[X:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pzero psub nnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: ret float [[FABS]] +; + %fabs = call float @llvm.fabs.f32(float %x) + ret float %fabs +} + +define float @returned_fabs_nonnorm_nozero(float nofpclass(nnorm nzero) %x) { +; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nonnorm_nozero +; CHECK-SAME: (float nofpclass(nzero nnorm) [[X:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nzero nnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: ret float [[FABS]] +; + %fabs = call float @llvm.fabs.f32(float %x) + ret float %fabs +} + define float @returned_fneg(float %x) { ; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) ; CHECK-LABEL: define float @returned_fneg @@ -938,9 +1059,9 @@ define float @returned_fneg_fabs_nopos(float nofpclass(pinf psub pnorm pzero) %x) { ; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define nofpclass(inf zero sub norm) float @returned_fneg_fabs_nopos 
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @returned_fneg_fabs_nopos ; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[X:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(inf zero sub norm) float @llvm.fabs.f32(float nofpclass(pinf pzero psub pnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf pzero psub pnorm) [[X]]) #[[ATTR13]] ; CHECK-NEXT: [[FNEG_FABS:%.*]] = fneg float [[FABS]] ; CHECK-NEXT: ret float [[FNEG_FABS]] ; @@ -951,9 +1072,9 @@ define float @returned_fneg_fabs_mixed(float nofpclass(psub nnorm nzero qnan ninf) %x) { ; CHECK: Function Attrs: nofree norecurse nosync nounwind willreturn memory(none) -; CHECK-LABEL: define nofpclass(qnan pinf pzero sub pnorm) float @returned_fneg_fabs_mixed +; CHECK-LABEL: define nofpclass(qnan pinf pzero psub pnorm) float @returned_fneg_fabs_mixed ; CHECK-SAME: (float nofpclass(qnan ninf nzero psub nnorm) [[X:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(qnan ninf nzero sub nnorm) float @llvm.fabs.f32(float nofpclass(qnan ninf nzero psub nnorm) [[X]]) #[[ATTR13]] +; CHECK-NEXT: [[FABS:%.*]] = call nofpclass(qnan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(qnan ninf nzero psub nnorm) [[X]]) #[[ATTR13]] ; CHECK-NEXT: [[FNEG_FABS:%.*]] = fneg float [[FABS]] ; CHECK-NEXT: ret float [[FNEG_FABS]] ; diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes="ipsccp" -funcspec-avg-loop-iters=3 -funcspec-min-function-size=10 -S < %s | FileCheck %s +; RUN: opt -passes="ipsccp" -force-specialization -S < %s | FileCheck %s ; CHECK-NOT: 
foo.{{[0-9]+}} diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes="ipsccp" -funcspec-for-literal-constant=true -funcspec-min-function-size=10 -S < %s | FileCheck %s +; RUN: opt -passes="ipsccp" -funcspec-for-literal-constant=true -force-specialization -S < %s | FileCheck %s ; Check that the literal constant parameter could be specialized. ; CHECK: @foo.1( diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-loop.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-loop.ll deleted file mode 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-loop.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: opt -passes="ipsccp" -funcspec-avg-loop-iters=5 -funcspec-min-function-size=10 -S < %s | FileCheck %s - -; Check that the loop depth results in a larger specialization bonus. 
-; CHECK: @foo.1( -; CHECK: @foo.2( - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" - -@A = external dso_local constant i32, align 4 -@B = external dso_local constant i32, align 4 -@C = external dso_local constant i32, align 4 -@D = external dso_local constant i32, align 4 - -declare i1 @cond_begin() -declare i1 @cond_end() -declare i1 @getCond() - -define internal i32 @foo(i32 %x, ptr %b, ptr %c) { -entry: - br label %loop.entry - -loop.entry: - br label %loop2.entry - -loop2.entry: - br label %loop2.body - -loop2.body: - %0 = load i32, ptr %b, align 4 - %1 = load i32, ptr %c, align 4 - %add.0 = add nsw i32 %0, %1 - %add = add nsw i32 %add.0, %x - br label %loop2.end - -loop2.end: - %cond.end = call i1 @cond_end() - br i1 %cond.end, label %loop2.entry, label %loop.end - -loop.end: - %cond2.end = call i1 @getCond() - br i1 %cond2.end, label %loop.entry, label %return - -return: - ret i32 %add -} - -define dso_local i32 @bar(i32 %x, i32 %y) { -entry: - %tobool = icmp ne i32 %x, 0 - br i1 %tobool, label %if.then, label %if.else - -if.then: - %call = call i32 @foo(i32 %x, ptr @A, ptr @C) - br label %return - -if.else: - %call1 = call i32 @foo(i32 %y, ptr @B, ptr @D) - br label %return - -return: - %retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.else ] - ret i32 %retval.0 -} diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll @@ -2,7 +2,6 @@ ; RUN: opt -passes="ipsccp,deadargelim" -force-specialization -S < %s | FileCheck %s ; RUN: opt -passes="ipsccp,deadargelim" -funcspec-max-iters=1 -force-specialization -S < %s | FileCheck %s ; RUN: opt -passes="ipsccp,deadargelim" -funcspec-max-iters=0 -force-specialization -S < %s | FileCheck %s --check-prefix=DISABLED -; RUN: opt 
-passes="ipsccp,deadargelim" -funcspec-avg-loop-iters=1 -force-specialization -S < %s | FileCheck %s ; DISABLED-NOT: @func.1( ; DISABLED-NOT: @func.2( diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll @@ -1,9 +1,7 @@ -; RUN: opt -passes="ipsccp" -funcspec-avg-loop-iters=3 -S < %s | \ +; RUN: opt -passes="ipsccp" -S < %s | \ ; RUN: FileCheck %s --check-prefixes=COMMON,DISABLED ; RUN: opt -passes="ipsccp" -force-specialization -S < %s | \ ; RUN: FileCheck %s --check-prefixes=COMMON,FORCE -; RUN: opt -passes="ipsccp" -funcspec-avg-loop-iters=3 -force-specialization -S < %s | \ -; RUN: FileCheck %s --check-prefixes=COMMON,FORCE ; Test for specializing a constant global. diff --git a/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll b/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll --- a/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll +++ b/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll @@ -71,3 +71,233 @@ bb12: ; preds = %bb10, %bb9 ret void } + +define half @diff_types_same_width_merge(i1 %cond, half %a, i16 %b) { +; CHECK-LABEL: @diff_types_same_width_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: BB0: +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: BB1: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16 [[B:%.*]] to half +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi half [ [[TMP0]], [[BB1]] ], [ [[A:%.*]], [[BB0]] ] +; CHECK-NEXT: ret half [[STOREMERGE]] +; +entry: + %alloca = alloca half + br i1 %cond, label %BB0, label %BB1 +BB0: + store half %a, ptr %alloca + br label %sink +BB1: + store i16 %b, ptr 
%alloca + br label %sink +sink: + %val = load half, ptr %alloca + ret half %val +} + +define i32 @diff_types_diff_width_no_merge(i1 %cond, i32 %a, i64 %b) { +; CHECK-LABEL: @diff_types_diff_width_no_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: store i32 [[A:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: B: +; CHECK-NEXT: store i64 [[B:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ALLOCA]], align 8 +; CHECK-NEXT: ret i32 [[VAL]] +; +entry: + %alloca = alloca i64 + br i1 %cond, label %A, label %B +A: + store i32 %a, ptr %alloca + br label %sink +B: + store i64 %b, ptr %alloca + br label %sink +sink: + %val = load i32, ptr %alloca + ret i32 %val +} + +define <4 x i32> @vec_no_merge(i1 %cond, <2 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @vec_no_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 16 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: store <2 x i32> [[A:%.*]], ptr [[ALLOCA]], align 16 +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: B: +; CHECK-NEXT: store <4 x i32> [[B:%.*]], ptr [[ALLOCA]], align 16 +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[VAL:%.*]] = load <4 x i32>, ptr [[ALLOCA]], align 16 +; CHECK-NEXT: ret <4 x i32> [[VAL]] +; +entry: + %alloca = alloca i64 + br i1 %cond, label %A, label %B +A: + store <2 x i32> %a, ptr %alloca + br label %sink +B: + store <4 x i32> %b, ptr %alloca + br label %sink +sink: + %val = load <4 x i32>, ptr %alloca + ret <4 x i32> %val +} + +%struct.half = type { half }; + +define %struct.half @one_elem_struct_merge(i1 %cond, %struct.half %a, half %b) { +; CHECK-LABEL: @one_elem_struct_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] +; 
CHECK: BB0: +; CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_HALF:%.*]] [[A:%.*]], 0 +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: BB1: +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi half [ [[TMP0]], [[BB0]] ], [ [[B:%.*]], [[BB1]] ] +; CHECK-NEXT: [[VAL1:%.*]] = insertvalue [[STRUCT_HALF]] poison, half [[STOREMERGE]], 0 +; CHECK-NEXT: ret [[STRUCT_HALF]] [[VAL1]] +; +entry: + %alloca = alloca i64 + br i1 %cond, label %BB0, label %BB1 +BB0: + store %struct.half %a, ptr %alloca + br label %sink +BB1: + store half %b, ptr %alloca + br label %sink +sink: + %val = load %struct.half, ptr %alloca + ret %struct.half %val +} + +%struct.tup = type { half, i32 }; + +define %struct.tup @multi_elem_struct_no_merge(i1 %cond, %struct.tup %a, half %b) { +; CHECK-LABEL: @multi_elem_struct_no_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: store [[STRUCT_TUP:%.*]] [[A:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: B: +; CHECK-NEXT: store half [[B:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[VAL:%.*]] = load [[STRUCT_TUP]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: ret [[STRUCT_TUP]] [[VAL]] +; +entry: + %alloca = alloca i64 + br i1 %cond, label %A, label %B +A: + store %struct.tup %a, ptr %alloca + br label %sink +B: + store half %b, ptr %alloca + br label %sink +sink: + %val = load %struct.tup, ptr %alloca + ret %struct.tup %val +} + +define i16 @same_types_diff_align_no_merge(i1 %cond, i16 %a, i16 %b) { +; CHECK-LABEL: @same_types_diff_align_no_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i16, align 4 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: BB0: +; CHECK-NEXT: store i16 [[A:%.*]], ptr [[ALLOCA]], align 8 +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: BB1: +; 
CHECK-NEXT: store i16 [[B:%.*]], ptr [[ALLOCA]], align 4 +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[ALLOCA]], align 4 +; CHECK-NEXT: ret i16 [[VAL]] +; +entry: + %alloca = alloca i16, align 4 + br i1 %cond, label %BB0, label %BB1 +BB0: + store i16 %a, ptr %alloca, align 8 + br label %sink +BB1: + store i16 %b, ptr %alloca, align 4 + br label %sink +sink: + %val = load i16, ptr %alloca + ret i16 %val +} + +define i64 @ptrtoint_merge(i1 %cond, i64 %a, ptr %b) { +; CHECK-LABEL: @ptrtoint_merge( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: BB0: +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: BB1: +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i64 [ [[A:%.*]], [[BB0]] ], [ [[TMP0]], [[BB1]] ] +; CHECK-NEXT: ret i64 [[STOREMERGE]] +; +entry: + %alloca = alloca ptr + br i1 %cond, label %BB0, label %BB1 +BB0: + store i64 %a, ptr %alloca + br label %sink +BB1: + store ptr %b, ptr %alloca + br label %sink +sink: + %val = load i64, ptr %alloca + ret i64 %val +} + +define ptr @inttoptr_merge(i1 %cond, i64 %a, ptr %b) { +; CHECK-LABEL: define ptr @inttoptr_merge +; CHECK-SAME: (i1 [[COND:%.*]], i64 [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: BB0: +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[A]] to ptr +; CHECK-NEXT: br label [[SINK:%.*]] +; CHECK: BB1: +; CHECK-NEXT: br label [[SINK]] +; CHECK: sink: +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi ptr [ [[B]], [[BB1]] ], [ [[TMP0]], [[BB0]] ] +; CHECK-NEXT: ret ptr [[STOREMERGE]] +; +entry: + %alloca = alloca ptr + br i1 %cond, label %BB0, label %BB1 +BB0: + store i64 %a, ptr %alloca, align 8 + br label %sink +BB1: + store ptr %b, ptr %alloca, align 8 + br label %sink +sink: + %val = load ptr, ptr %alloca + ret ptr %val +} diff --git 
a/llvm/test/Transforms/InstSimplify/floating-point-compare.ll b/llvm/test/Transforms/InstSimplify/floating-point-compare.ll --- a/llvm/test/Transforms/InstSimplify/floating-point-compare.ll +++ b/llvm/test/Transforms/InstSimplify/floating-point-compare.ll @@ -1475,6 +1475,36 @@ ret i1 %r } +define i1 @ogt_zero_fabs_select_negone_or_pinf(i1 %cond) { +; CHECK-LABEL: @ogt_zero_fabs_select_negone_or_pinf( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND:%.*]], float -1.000000e+00, float 0x7FF0000000000000 +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[ONE:%.*]] = fcmp ogt float [[FABS]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[ONE]] +; +entry: + %select = select i1 %cond, float -1.0, float 0x7FF0000000000000 + %fabs = call float @llvm.fabs.f32(float %select) + %one = fcmp ogt float %fabs, 0.0 + ret i1 %one +} + +define i1 @ogt_zero_fabs_select_one_or_ninf(i1 %cond) { +; CHECK-LABEL: @ogt_zero_fabs_select_one_or_ninf( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float 0xFFF0000000000000 +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[ONE:%.*]] = fcmp ogt float [[FABS]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[ONE]] +; +entry: + %select = select i1 %cond, float 1.0, float 0xFFF0000000000000 + %fabs = call float @llvm.fabs.f32(float %select) + %one = fcmp ogt float %fabs, 0.0 + ret i1 %one +} + declare <2 x double> @llvm.fabs.v2f64(<2 x double>) declare <2 x float> @llvm.fabs.v2f32(<2 x float>) declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) diff --git a/llvm/test/Transforms/InstSimplify/known-never-infinity.ll b/llvm/test/Transforms/InstSimplify/known-never-infinity.ll --- a/llvm/test/Transforms/InstSimplify/known-never-infinity.ll +++ b/llvm/test/Transforms/InstSimplify/known-never-infinity.ll @@ -1021,6 +1021,73 @@ ret i1 %cmp } +define i1 @not_inf_fabs_select_pzero_or_ninf(i1 %cond) { +; 
CHECK-LABEL: define i1 @not_inf_fabs_select_pzero_or_ninf +; CHECK-SAME: (i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0.000000e+00, float 0xFFF0000000000000 +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[ONE:%.*]] = fcmp one float [[FABS]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[ONE]] +; +entry: + %select = select i1 %cond, float 0.000000e+00, float 0xFFF0000000000000 + %fabs = call float @llvm.fabs.f32(float %select) + %one = fcmp one float %fabs, 0x7FF0000000000000 + ret i1 %one +} + +define i1 @not_inf_fabs_select_nzero_or_pinf(i1 %cond) { +; CHECK-LABEL: define i1 @not_inf_fabs_select_nzero_or_pinf +; CHECK-SAME: (i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float -0.000000e+00, float 0x7FF0000000000000 +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[ONE:%.*]] = fcmp one float [[FABS]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[ONE]] +; +entry: + %select = select i1 %cond, float -0.000000e+00, float 0x7FF0000000000000 + %fabs = call float @llvm.fabs.f32(float %select) + %one = fcmp one float %fabs, 0x7FF0000000000000 + ret i1 %one +} + +define i1 @not_ninf_fabs_select_nzero_or_pinf(i1 %cond) { +; CHECK-LABEL: define i1 @not_ninf_fabs_select_nzero_or_pinf +; CHECK-SAME: (i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float -0.000000e+00, float 0x7FF0000000000000 +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[ONE:%.*]] = fcmp one float [[FABS]], 0xFFF0000000000000 +; CHECK-NEXT: ret i1 [[ONE]] +; +entry: + %select = select i1 %cond, float -0.000000e+00, float 0x7FF0000000000000 + %fabs = call float @llvm.fabs.f32(float %select) + %one = fcmp one float %fabs, 0xFFF0000000000000 + ret i1 %one +} + +define i1 @not_ninf_fneg_fabs_select_nzero_or_pinf(i1 %cond) { +; CHECK-LABEL: define i1 
@not_ninf_fneg_fabs_select_nzero_or_pinf +; CHECK-SAME: (i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float -0.000000e+00, float 0x7FF0000000000000 +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]]) +; CHECK-NEXT: [[FNEG_FABS:%.*]] = fneg float [[FABS]] +; CHECK-NEXT: [[ONE:%.*]] = fcmp one float [[FNEG_FABS]], 0xFFF0000000000000 +; CHECK-NEXT: ret i1 [[ONE]] +; +entry: + %select = select i1 %cond, float -0.000000e+00, float 0x7FF0000000000000 + %fabs = call float @llvm.fabs.f32(float %select) + %fneg.fabs = fneg float %fabs + %one = fcmp one float %fneg.fabs, 0xFFF0000000000000 + ret i1 %one +} + + declare double @llvm.arithmetic.fence.f64(double) declare double @llvm.canonicalize.f64(double) declare double @llvm.ceil.f64(double) @@ -1029,6 +1096,7 @@ declare double @llvm.exp2.f64(double) declare double @llvm.exp.f64(double) declare double @llvm.fabs.f64(double) +declare float @llvm.fabs.f32(float) declare double @llvm.floor.f64(double) declare double @llvm.fma.f64(double, double, double) declare double @llvm.fmuladd.f64(double, double, double) diff --git a/llvm/test/Transforms/InstSimplify/strictfp-sqrt-nonneg.ll b/llvm/test/Transforms/InstSimplify/strictfp-sqrt-nonneg.ll --- a/llvm/test/Transforms/InstSimplify/strictfp-sqrt-nonneg.ll +++ b/llvm/test/Transforms/InstSimplify/strictfp-sqrt-nonneg.ll @@ -81,15 +81,14 @@ ret float %sub } -; Test all the rounding modes. Exception handling shouldn't matter. +; Test all the rounding modes. Rounding mode and exception handling +; shouldn't matter. -; Negative test: should not fire due to rounding mode metadata. 
define float @nonneg_u_downward(i32 %a) #0 { ; CHECK-LABEL: @nonneg_u_downward( ; CHECK-NEXT: [[FPA:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[A:%.*]], metadata !"round.downward", metadata !"fpexcept.ignore") #[[ATTR0]] ; CHECK-NEXT: [[SQRA:%.*]] = call float @llvm.experimental.constrained.sqrt.f32(float [[FPA]], metadata !"round.downward", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: [[SUB:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[SQRA]], float -0.000000e+00, metadata !"round.downward", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: ret float [[SUB]] +; CHECK-NEXT: ret float [[SQRA]] ; %fpa = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %a, metadata !"round.downward", metadata !"fpexcept.ignore") #0 %sqra = call float @llvm.experimental.constrained.sqrt.f32(float %fpa, metadata !"round.downward", metadata !"fpexcept.ignore") #0 @@ -97,13 +96,11 @@ ret float %sub } -; Negative test: should not fire due to rounding mode metadata. 
define float @nonneg_s_downward(i32 %a) #0 { ; CHECK-LABEL: @nonneg_s_downward( ; CHECK-NEXT: [[FPA:%.*]] = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[A:%.*]], metadata !"round.downward", metadata !"fpexcept.ignore") #[[ATTR0]] ; CHECK-NEXT: [[SQRA:%.*]] = call float @llvm.experimental.constrained.sqrt.f32(float [[FPA]], metadata !"round.downward", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: [[SUB:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[SQRA]], float -0.000000e+00, metadata !"round.downward", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: ret float [[SUB]] +; CHECK-NEXT: ret float [[SQRA]] ; %fpa = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.downward", metadata !"fpexcept.ignore") #0 %sqra = call float @llvm.experimental.constrained.sqrt.f32(float %fpa, metadata !"round.downward", metadata !"fpexcept.ignore") #0 @@ -111,13 +108,11 @@ ret float %sub } -; Negative test: should not fire due to rounding mode metadata. 
define float @nonneg_u_upward(i32 %a) #0 { ; CHECK-LABEL: @nonneg_u_upward( ; CHECK-NEXT: [[FPA:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[A:%.*]], metadata !"round.upward", metadata !"fpexcept.ignore") #[[ATTR0]] ; CHECK-NEXT: [[SQRA:%.*]] = call float @llvm.experimental.constrained.sqrt.f32(float [[FPA]], metadata !"round.upward", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: [[SUB:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[SQRA]], float -0.000000e+00, metadata !"round.upward", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: ret float [[SUB]] +; CHECK-NEXT: ret float [[SQRA]] ; %fpa = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %a, metadata !"round.upward", metadata !"fpexcept.ignore") #0 %sqra = call float @llvm.experimental.constrained.sqrt.f32(float %fpa, metadata !"round.upward", metadata !"fpexcept.ignore") #0 @@ -125,13 +120,11 @@ ret float %sub } -; Negative test: should not fire due to rounding mode metadata. 
define float @nonneg_s_upward(i32 %a) #0 { ; CHECK-LABEL: @nonneg_s_upward( ; CHECK-NEXT: [[FPA:%.*]] = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[A:%.*]], metadata !"round.upward", metadata !"fpexcept.ignore") #[[ATTR0]] ; CHECK-NEXT: [[SQRA:%.*]] = call float @llvm.experimental.constrained.sqrt.f32(float [[FPA]], metadata !"round.upward", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: [[SUB:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[SQRA]], float -0.000000e+00, metadata !"round.upward", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: ret float [[SUB]] +; CHECK-NEXT: ret float [[SQRA]] ; %fpa = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.upward", metadata !"fpexcept.ignore") #0 %sqra = call float @llvm.experimental.constrained.sqrt.f32(float %fpa, metadata !"round.upward", metadata !"fpexcept.ignore") #0 @@ -139,13 +132,11 @@ ret float %sub } -; Negative test: should not fire due to rounding mode metadata. 
define float @nonneg_u_towardzero(i32 %a) #0 { ; CHECK-LABEL: @nonneg_u_towardzero( ; CHECK-NEXT: [[FPA:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[A:%.*]], metadata !"round.towardzero", metadata !"fpexcept.ignore") #[[ATTR0]] ; CHECK-NEXT: [[SQRA:%.*]] = call float @llvm.experimental.constrained.sqrt.f32(float [[FPA]], metadata !"round.towardzero", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: [[SUB:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[SQRA]], float -0.000000e+00, metadata !"round.towardzero", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: ret float [[SUB]] +; CHECK-NEXT: ret float [[SQRA]] ; %fpa = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %a, metadata !"round.towardzero", metadata !"fpexcept.ignore") #0 %sqra = call float @llvm.experimental.constrained.sqrt.f32(float %fpa, metadata !"round.towardzero", metadata !"fpexcept.ignore") #0 @@ -153,13 +144,11 @@ ret float %sub } -; Negative test: should not fire due to rounding mode metadata. 
define float @nonneg_s_towardzero(i32 %a) #0 { ; CHECK-LABEL: @nonneg_s_towardzero( ; CHECK-NEXT: [[FPA:%.*]] = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[A:%.*]], metadata !"round.towardzero", metadata !"fpexcept.ignore") #[[ATTR0]] ; CHECK-NEXT: [[SQRA:%.*]] = call float @llvm.experimental.constrained.sqrt.f32(float [[FPA]], metadata !"round.towardzero", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: [[SUB:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[SQRA]], float -0.000000e+00, metadata !"round.towardzero", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: ret float [[SUB]] +; CHECK-NEXT: ret float [[SQRA]] ; %fpa = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.towardzero", metadata !"fpexcept.ignore") #0 %sqra = call float @llvm.experimental.constrained.sqrt.f32(float %fpa, metadata !"round.towardzero", metadata !"fpexcept.ignore") #0 @@ -167,13 +156,11 @@ ret float %sub } -; Negative test: should not fire due to rounding mode metadata. 
define float @nonneg_u_tonearestaway(i32 %a) #0 { ; CHECK-LABEL: @nonneg_u_tonearestaway( ; CHECK-NEXT: [[FPA:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[A:%.*]], metadata !"round.tonearestaway", metadata !"fpexcept.ignore") #[[ATTR0]] ; CHECK-NEXT: [[SQRA:%.*]] = call float @llvm.experimental.constrained.sqrt.f32(float [[FPA]], metadata !"round.tonearestaway", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: [[SUB:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[SQRA]], float -0.000000e+00, metadata !"round.tonearestaway", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: ret float [[SUB]] +; CHECK-NEXT: ret float [[SQRA]] ; %fpa = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %a, metadata !"round.tonearestaway", metadata !"fpexcept.ignore") #0 %sqra = call float @llvm.experimental.constrained.sqrt.f32(float %fpa, metadata !"round.tonearestaway", metadata !"fpexcept.ignore") #0 @@ -181,13 +168,11 @@ ret float %sub } -; Negative test: should not fire due to rounding mode metadata. 
define float @nonneg_s_tonearestaway(i32 %a) #0 { ; CHECK-LABEL: @nonneg_s_tonearestaway( ; CHECK-NEXT: [[FPA:%.*]] = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[A:%.*]], metadata !"round.tonearestaway", metadata !"fpexcept.ignore") #[[ATTR0]] ; CHECK-NEXT: [[SQRA:%.*]] = call float @llvm.experimental.constrained.sqrt.f32(float [[FPA]], metadata !"round.tonearestaway", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: [[SUB:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[SQRA]], float -0.000000e+00, metadata !"round.tonearestaway", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: ret float [[SUB]] +; CHECK-NEXT: ret float [[SQRA]] ; %fpa = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.tonearestaway", metadata !"fpexcept.ignore") #0 %sqra = call float @llvm.experimental.constrained.sqrt.f32(float %fpa, metadata !"round.tonearestaway", metadata !"fpexcept.ignore") #0 @@ -195,13 +180,11 @@ ret float %sub } -; Negative test: should not fire due to rounding mode metadata. 
define float @nonneg_u_dynamic(i32 %a) #0 { ; CHECK-LABEL: @nonneg_u_dynamic( ; CHECK-NEXT: [[FPA:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[A:%.*]], metadata !"round.dynamic", metadata !"fpexcept.ignore") #[[ATTR0]] ; CHECK-NEXT: [[SQRA:%.*]] = call float @llvm.experimental.constrained.sqrt.f32(float [[FPA]], metadata !"round.dynamic", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: [[SUB:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[SQRA]], float -0.000000e+00, metadata !"round.dynamic", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: ret float [[SUB]] +; CHECK-NEXT: ret float [[SQRA]] ; %fpa = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 %sqra = call float @llvm.experimental.constrained.sqrt.f32(float %fpa, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 @@ -209,13 +192,11 @@ ret float %sub } -; Negative test: should not fire due to rounding mode metadata. 
define float @nonneg_s_dynamic(i32 %a) #0 { ; CHECK-LABEL: @nonneg_s_dynamic( ; CHECK-NEXT: [[FPA:%.*]] = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[A:%.*]], metadata !"round.dynamic", metadata !"fpexcept.ignore") #[[ATTR0]] ; CHECK-NEXT: [[SQRA:%.*]] = call float @llvm.experimental.constrained.sqrt.f32(float [[FPA]], metadata !"round.dynamic", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: [[SUB:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[SQRA]], float -0.000000e+00, metadata !"round.dynamic", metadata !"fpexcept.ignore") #[[ATTR0]] -; CHECK-NEXT: ret float [[SUB]] +; CHECK-NEXT: ret float [[SQRA]] ; %fpa = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 %sqra = call float @llvm.experimental.constrained.sqrt.f32(float %fpa, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 diff --git a/llvm/test/Transforms/LICM/hoist-add-sub.ll b/llvm/test/Transforms/LICM/hoist-add-sub.ll --- a/llvm/test/Transforms/LICM/hoist-add-sub.ll +++ b/llvm/test/Transforms/LICM/hoist-add-sub.ll @@ -165,18 +165,18 @@ } -; TODO: x + iv < 4 ==> iv < 4 - x +; x + iv < 4 ==> iv < 4 - x define i32 @test_02(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_02 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG0]] ; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub nsw i32 4, [[X]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[ARITH:%.*]] = add nsw i32 [[X]], [[IV]] -; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[ARITH]], 4 +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[IV]], [[INVARIANT_OP]] ; CHECK-NEXT: br i1 [[X_CHECK]], label 
[[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] ; CHECK: backedge: ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] @@ -391,18 +391,18 @@ ret i32 -2 } -; TODO: iv + x < 4 ==> iv < 4 - x +; iv + x < 4 ==> iv < 4 - x define i32 @test_04(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_04 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG0]] ; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub nsw i32 4, [[X]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[ARITH:%.*]] = add nsw i32 [[IV]], [[X]] -; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[ARITH]], 4 +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[IV]], [[INVARIANT_OP]] ; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] ; CHECK: backedge: ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/getpointerschaincost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/getpointerschaincost.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/getpointerschaincost.ll @@ -0,0 +1,101 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mtriple=riscv64 -mattr=+v -riscv-v-slp-max-vf=0 -passes=slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s +; RUN: FileCheck --input-file=%t --check-prefix=YAML %s + +; Because all of these addresses are foldable, the scalar cost should be 0 when +; computing the pointers chain cost. 
+; +; TODO: These are currently costed as free because the indices are all constants, but we +; should check if the constants are actually foldable define void @f(ptr %dest, i64 %i) { +; CHECK-LABEL: define void @f +; CHECK-SAME: (ptr [[DEST:%.*]], i64 [[I:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr i32, ptr [[DEST]], i32 0 +; CHECK-NEXT: store <4 x i32> , ptr [[P1]], align 4 +; CHECK-NEXT: ret void +; +entry: +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: StoresVectorized +; YAML-NEXT: Function: f +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' +; YAML-NEXT: - Cost: '-2' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '2' + %p1 = getelementptr i32, ptr %dest, i32 0 + store i32 1, ptr %p1 + %p2 = getelementptr i32, ptr %dest, i32 1 + store i32 1, ptr %p2 + %p3 = getelementptr i32, ptr %dest, i32 2 + store i32 1, ptr %p3 + %p4 = getelementptr i32, ptr %dest, i32 3 + store i32 1, ptr %p4 + ret void +} + +; When computing the scalar pointers chain cost here, there is a cost of 1 for +; the base pointer, and the rest can be folded in, so the scalar cost should be +; 1. 
+; +; TODO: These are currently costed as free because the indices are all constants, but we +; should check if the constants are actually foldable define void @g(ptr %dest, i64 %i) { +; CHECK-LABEL: define void @g +; CHECK-SAME: (ptr [[DEST:%.*]], i64 [[I:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr i32, ptr [[DEST]], i32 2048 +; CHECK-NEXT: store <4 x i32> , ptr [[P1]], align 4 +; CHECK-NEXT: ret void +; +entry: +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: StoresVectorized +; YAML-NEXT: Function: g +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' +; YAML-NEXT: - Cost: '-2' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '2' + %p1 = getelementptr i32, ptr %dest, i32 2048 + store i32 1, ptr %p1 + %p2 = getelementptr i32, ptr %dest, i32 2049 + store i32 1, ptr %p2 + %p3 = getelementptr i32, ptr %dest, i32 2050 + store i32 1, ptr %p3 + %p4 = getelementptr i32, ptr %dest, i32 2051 + store i32 1, ptr %p4 + ret void +} + +; When computing the scalar pointers chain cost here, there is a cost of +; 1 for the base pointer, and the rest can be folded in, so the scalar cost +; should be 1. 
+define void @h(ptr %dest, i32 %i) { +; CHECK-LABEL: define void @h +; CHECK-SAME: (ptr [[DEST:%.*]], i32 [[I:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr [4 x i32], ptr [[DEST]], i32 [[I]], i32 0 +; CHECK-NEXT: store <4 x i32> , ptr [[P1]], align 4 +; CHECK-NEXT: ret void +; +entry: +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: StoresVectorized +; YAML-NEXT: Function: h +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' +; YAML-NEXT: - Cost: '-2' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '2' + %p1 = getelementptr [4 x i32], ptr %dest, i32 %i, i32 0 + store i32 1, ptr %p1 + %p2 = getelementptr [4 x i32], ptr %dest, i32 %i, i32 1 + store i32 1, ptr %p2 + %p3 = getelementptr [4 x i32], ptr %dest, i32 %i, i32 2 + store i32 1, ptr %p3 + %p4 = getelementptr [4 x i32], ptr %dest, i32 %i, i32 3 + store i32 1, ptr %p4 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll --- a/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll @@ -2,7 +2,9 @@ ; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \ ; RUN: -riscv-v-slp-max-vf=0 -S | FileCheck %s -; FIXME: This should not be vectorized +; This shouldn't be vectorized as the extra address computation required for the +; vector store make it unprofitable (vle/vse don't have an offset in their +; addressing modes) %struct.2i32 = type { i32, i32 } @@ -10,7 +12,9 @@ ; CHECK-LABEL: @splat_store_v2i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = getelementptr [[STRUCT_2I32:%.*]], ptr [[DEST:%.*]], i64 [[I:%.*]], i32 0 -; CHECK-NEXT: store <2 x i32> , ptr [[P1]], align 4 +; CHECK-NEXT: store i32 1, ptr [[P1]], align 4 +; CHECK-NEXT: [[P2:%.*]] = getelementptr [[STRUCT_2I32]], ptr [[DEST]], i64 [[I]], i32 1 +; CHECK-NEXT: store i32 1, ptr [[P2]], align 4 ; CHECK-NEXT: ret 
void ; entry: diff --git a/llvm/test/tools/llvm-mca/RISCV/different-instruments.s b/llvm/test/tools/llvm-mca/RISCV/different-instruments.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/different-instruments.s @@ -0,0 +1,76 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -timeline -iterations=1 < %s | FileCheck %s + +vsetvli zero, a0, e8, m1, tu, mu +# LLVM-MCA-RISCV-LMUL M1 +vadd.vv v12, v12, v12 +vsetvli zero, a0, e8, m8, tu, mu +# LLVM-MCA-RISCV-LMUL M8 +vadd.vv v12, v12, v12 + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 4 +# CHECK-NEXT: Total Cycles: 12 +# CHECK-NEXT: Total uOps: 4 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.33 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 18.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SiFive7FDiv +# CHECK-NEXT: [1] - SiFive7IDiv +# CHECK-NEXT: [2] - SiFive7PipeA +# CHECK-NEXT: [3] - SiFive7PipeB +# CHECK-NEXT: [4] - SiFive7PipeV +# CHECK-NEXT: [5] - SiFive7VA +# CHECK-NEXT: [6] - SiFive7VL +# CHECK-NEXT: [7] - SiFive7VS + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] +# CHECK-NEXT: - - 2.00 - 18.00 18.00 - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions: +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: - - - - 2.00 2.00 - - vadd.vv v12, v12, v12 +# CHECK-NEXT: - - 1.00 - - - - 
- vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: - - - - 16.00 16.00 - - vadd.vv v12, v12, v12 + +# CHECK: Timeline view: +# CHECK-NEXT: 01 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeE . .. vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: [0,1] . DeeeE .. vadd.vv v12, v12, v12 +# CHECK-NEXT: [0,2] . DeeE .. vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: [0,3] . . DeeeE vadd.vv v12, v12, v12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1. 1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 2. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: 3. 1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/RISCV/disable-im.s b/llvm/test/tools/llvm-mca/RISCV/disable-im.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/disable-im.s @@ -0,0 +1,87 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -timeline -iterations=1 -disable-im < %s | FileCheck %s + +vsetvli zero, a0, e8, m2, tu, mu +# LLVM-MCA-RISCV-LMUL M2 +vadd.vv v12, v12, v12 +vsetvli zero, a0, e8, m1, tu, mu +# LLVM-MCA-RISCV-LMUL M1 +vadd.vv v12, v12, v12 +vsetvli zero, a0, e8, m8, tu, mu +# LLVM-MCA-RISCV-LMUL M8 +vadd.vv v12, v12, v12 + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 6 +# CHECK-NEXT: Total Cycles: 40 +# CHECK-NEXT: Total uOps: 6 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.15 +# CHECK-NEXT: IPC: 0.15 +# CHECK-NEXT: Block RThroughput: 48.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# 
CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m2, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SiFive7FDiv +# CHECK-NEXT: [1] - SiFive7IDiv +# CHECK-NEXT: [2] - SiFive7PipeA +# CHECK-NEXT: [3] - SiFive7PipeB +# CHECK-NEXT: [4] - SiFive7PipeV +# CHECK-NEXT: [5] - SiFive7VA +# CHECK-NEXT: [6] - SiFive7VL +# CHECK-NEXT: [7] - SiFive7VS + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] +# CHECK-NEXT: - - 3.00 - 48.00 48.00 - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions: +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, a0, e8, m2, tu, mu +# CHECK-NEXT: - - - - 16.00 16.00 - - vadd.vv v12, v12, v12 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: - - - - 16.00 16.00 - - vadd.vv v12, v12, v12 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: - - - - 16.00 16.00 - - vadd.vv v12, v12, v12 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeE . . . . . . . . vsetvli zero, a0, e8, m2, tu, mu +# CHECK-NEXT: [0,1] . DeeeE . . . . . . . vadd.vv v12, v12, v12 +# CHECK-NEXT: [0,2] . DeeE . . . . . . . vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: [0,3] . . . . DeeeE . . . . vadd.vv v12, v12, v12 +# CHECK-NEXT: [0,4] . . . . DeeE . . . . vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: [0,5] . . . . . . . 
DeeeE vadd.vv v12, v12, v12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m2, tu, mu +# CHECK-NEXT: 1. 1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 2. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 3. 1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 4. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: 5. 1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/RISCV/instrument-at-start.s b/llvm/test/tools/llvm-mca/RISCV/instrument-at-start.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/instrument-at-start.s @@ -0,0 +1,64 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -timeline -iterations=1 < %s | FileCheck %s + +vsetvli zero, a0, e8, m1, tu, mu +# LLVM-MCA-RISCV-LMUL M1 +vadd.vv v12, v12, v12 + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK-NEXT: Total uOps: 2 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SiFive7FDiv +# CHECK-NEXT: [1] - SiFive7IDiv +# CHECK-NEXT: [2] - SiFive7PipeA +# CHECK-NEXT: 
[3] - SiFive7PipeB +# CHECK-NEXT: [4] - SiFive7PipeV +# CHECK-NEXT: [5] - SiFive7VA +# CHECK-NEXT: [6] - SiFive7VL +# CHECK-NEXT: [7] - SiFive7VS + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] +# CHECK-NEXT: - - 1.00 - 2.00 2.00 - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions: +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: - - - - 2.00 2.00 - - vadd.vv v12, v12, v12 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 01234567 + +# CHECK: [0,0] DeeE . . vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: [0,1] . DeeeE vadd.vv v12, v12, v12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1. 
1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/RISCV/instrument-in-middle.s b/llvm/test/tools/llvm-mca/RISCV/instrument-in-middle.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/instrument-in-middle.s @@ -0,0 +1,70 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -timeline -iterations=1 < %s | FileCheck %s + +vadd.vv v12, v12, v12 +vsetvli zero, a0, e8, m8, tu, mu +# LLVM-MCA-RISCV-LMUL MF8 +vadd.vv v12, v12, v12 + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 3 +# CHECK-NEXT: Total Cycles: 21 +# CHECK-NEXT: Total uOps: 3 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.14 +# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: Block RThroughput: 17.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SiFive7FDiv +# CHECK-NEXT: [1] - SiFive7IDiv +# CHECK-NEXT: [2] - SiFive7PipeA +# CHECK-NEXT: [3] - SiFive7PipeB +# CHECK-NEXT: [4] - SiFive7PipeV +# CHECK-NEXT: [5] - SiFive7VA +# CHECK-NEXT: [6] - SiFive7VL +# CHECK-NEXT: [7] - SiFive7VS + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] +# CHECK-NEXT: - - 1.00 - 17.00 17.00 - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions: +# CHECK-NEXT: - - - - 16.00 16.00 - - vadd.vv v12, v12, v12 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: - - - - 1.00 1.00 - - vadd.vv v12, v12, v12 + +# CHECK: Timeline view: +# 
CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0 + +# CHECK: [0,0] DeeeE. . . . vadd.vv v12, v12, v12 +# CHECK-NEXT: [0,1] .DeeE. . . . vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: [0,2] . . . .DeeeE vadd.vv v12, v12, v12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m8, tu, mu +# CHECK-NEXT: 2. 1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/RISCV/instrument-in-region.s b/llvm/test/tools/llvm-mca/RISCV/instrument-in-region.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/instrument-in-region.s @@ -0,0 +1,68 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -timeline -iterations=1 < %s | FileCheck %s + +# LLVM-MCA-BEGIN foo +vsetvli zero, a0, e8, m1, tu, mu +# LLVM-MCA-RISCV-LMUL M1 +vadd.vv v12, v12, v12 +# LLVM-MCA-END foo + +# CHECK: [0] Code Region - foo + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK-NEXT: Total uOps: 2 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 + +# CHECK: Resources: +# CHECK-NEXT: 
[0] - SiFive7FDiv +# CHECK-NEXT: [1] - SiFive7IDiv +# CHECK-NEXT: [2] - SiFive7PipeA +# CHECK-NEXT: [3] - SiFive7PipeB +# CHECK-NEXT: [4] - SiFive7PipeV +# CHECK-NEXT: [5] - SiFive7VA +# CHECK-NEXT: [6] - SiFive7VL +# CHECK-NEXT: [7] - SiFive7VS + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] +# CHECK-NEXT: - - 1.00 - 2.00 2.00 - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions: +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: - - - - 2.00 2.00 - - vadd.vv v12, v12, v12 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 01234567 + +# CHECK: [0,0] DeeE . . vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: [0,1] . DeeeE vadd.vv v12, v12, v12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1. 
1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/RISCV/instrument-straddles-region.s b/llvm/test/tools/llvm-mca/RISCV/instrument-straddles-region.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/instrument-straddles-region.s @@ -0,0 +1,69 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -timeline -iterations=1 < %s | FileCheck %s + +# LLVM-MCA-BEGIN foo +vsetvli zero, a0, e8, m1, tu, mu +# LLVM-MCA-RISCV-LMUL M1 +vadd.vv v12, v12, v12 +# LLVM-MCA-END foo +vadd.vv v12, v12, v12 + +# CHECK: [0] Code Region - foo + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK-NEXT: Total uOps: 2 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SiFive7FDiv +# CHECK-NEXT: [1] - SiFive7IDiv +# CHECK-NEXT: [2] - SiFive7PipeA +# CHECK-NEXT: [3] - SiFive7PipeB +# CHECK-NEXT: [4] - SiFive7PipeV +# CHECK-NEXT: [5] - SiFive7VA +# CHECK-NEXT: [6] - SiFive7VL +# CHECK-NEXT: [7] - SiFive7VS + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] +# CHECK-NEXT: - - 1.00 - 2.00 2.00 - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions: +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: - - - - 2.00 2.00 - - vadd.vv v12, v12, v12 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 01234567 
+ +# CHECK: [0,0] DeeE . . vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: [0,1] . DeeeE vadd.vv v12, v12, v12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1. 1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/RISCV/lit.local.cfg b/llvm/test/tools/llvm-mca/RISCV/lit.local.cfg new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'RISCV' in config.root.targets: + config.unsupported = True diff --git a/llvm/test/tools/llvm-mca/RISCV/multiple-same-instruments.s b/llvm/test/tools/llvm-mca/RISCV/multiple-same-instruments.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/multiple-same-instruments.s @@ -0,0 +1,97 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -timeline -iterations=1 < %s | FileCheck %s + +vsetvli zero, a0, e8, m1, tu, mu +# LLVM-MCA-RISCV-LMUL M1 +vadd.vv v12, v12, v12 +vsetvli zero, a0, e8, m1, tu, mu +# LLVM-MCA-RISCV-LMUL M1 +vadd.vv v12, v12, v12 +vsub.vv v12, v12, v12 +vsetvli zero, a0, e8, m2, tu, mu +# LLVM-MCA-RISCV-LMUL M4 +vadd.vv v12, v12, v12 +vsub.vv v12, v12, v12 + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 8 +# CHECK-NEXT: Total Cycles: 28 +# CHECK-NEXT: Total uOps: 8 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.29 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 22.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# 
CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 4 16.00 vsub.vv v12, v12, v12 +# CHECK-NEXT: 1 3 1.00 U vsetvli zero, a0, e8, m2, tu, mu +# CHECK-NEXT: 1 4 16.00 vadd.vv v12, v12, v12 +# CHECK-NEXT: 1 4 16.00 vsub.vv v12, v12, v12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SiFive7FDiv +# CHECK-NEXT: [1] - SiFive7IDiv +# CHECK-NEXT: [2] - SiFive7PipeA +# CHECK-NEXT: [3] - SiFive7PipeB +# CHECK-NEXT: [4] - SiFive7PipeV +# CHECK-NEXT: [5] - SiFive7VA +# CHECK-NEXT: [6] - SiFive7VL +# CHECK-NEXT: [7] - SiFive7VS + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] +# CHECK-NEXT: - - 3.00 - 22.00 22.00 - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions: +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: - - - - 2.00 2.00 - - vadd.vv v12, v12, v12 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: - - - - 2.00 2.00 - - vadd.vv v12, v12, v12 +# CHECK-NEXT: - - - - 2.00 2.00 - - vsub.vv v12, v12, v12 +# CHECK-NEXT: - - 1.00 - - - - - vsetvli zero, a0, e8, m2, tu, mu +# CHECK-NEXT: - - - - 8.00 8.00 - - vadd.vv v12, v12, v12 +# CHECK-NEXT: - - - - 8.00 8.00 - - vsub.vv v12, v12, v12 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234567 + +# CHECK: [0,0] DeeE . . . . . . vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: [0,1] . DeeeE . . . . . vadd.vv v12, v12, v12 +# CHECK-NEXT: [0,2] . DeeE . . . . . vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: [0,3] . . DeeeE . . . . vadd.vv v12, v12, v12 +# CHECK-NEXT: [0,4] . . .DeeeE . . . vsub.vv v12, v12, v12 +# CHECK-NEXT: [0,5] . . . DeeE . . 
. vsetvli zero, a0, e8, m2, tu, mu +# CHECK-NEXT: [0,6] . . . DeeeE. . . vadd.vv v12, v12, v12 +# CHECK-NEXT: [0,7] . . . . . DeeeE vsub.vv v12, v12, v12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 1. 1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 2. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m1, tu, mu +# CHECK-NEXT: 3. 1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 4. 1 0.0 0.0 0.0 vsub.vv v12, v12, v12 +# CHECK-NEXT: 5. 1 0.0 0.0 0.0 vsetvli zero, a0, e8, m2, tu, mu +# CHECK-NEXT: 6. 1 0.0 0.0 0.0 vadd.vv v12, v12, v12 +# CHECK-NEXT: 7. 1 0.0 0.0 0.0 vsub.vv v12, v12, v12 +# CHECK-NEXT: 1 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/RISCV/riscv-instrument-no-data-is-err.s b/llvm/test/tools/llvm-mca/RISCV/riscv-instrument-no-data-is-err.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/riscv-instrument-no-data-is-err.s @@ -0,0 +1,10 @@ +# RUN: not llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -iterations=1 < %s 2>&1 | FileCheck %s + +vsetvli zero, a0, e8, m1, tu, mu +# LLVM-MCA-RISCV-LMUL +vadd.vv v12, v12, v12 + +# CHECK: error: Failed to create RISCV-LMUL instrument with no data +# CHECK: # LLVM-MCA-RISCV-LMUL +# CHECK: ^ +# CHECK: error: There was an error parsing comments. 
diff --git a/llvm/test/tools/llvm-mca/RISCV/unknown-instrument-is-err.s b/llvm/test/tools/llvm-mca/RISCV/unknown-instrument-is-err.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/unknown-instrument-is-err.s @@ -0,0 +1,10 @@ +# RUN: not llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -timeline -iterations=1 < %s 2>&1 | FileCheck %s + +# LLVM-MCA-UNKNOWN M1 +vsetvli zero, a0, e8, m1, tu, mu +vadd.vv v12, v12, v12 + +# CHECK: error: Unknown instrumentation type in LLVM-MCA comment: UNKNOWN +# CHECK: # LLVM-MCA-UNKNOWN M1 +# CHECK: ^ +# CHECK: error: There was an error parsing comments. diff --git a/llvm/test/tools/llvm-mca/RISCV/unknown-lmul-is-err.s b/llvm/test/tools/llvm-mca/RISCV/unknown-lmul-is-err.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/unknown-lmul-is-err.s @@ -0,0 +1,10 @@ +# RUN: not llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -iterations=1 < %s 2>&1 | FileCheck %s + +vsetvli zero, a0, e8, m1, tu, mu +# LLVM-MCA-RISCV-V MF9 +vadd.vv v12, v12, v12 + +# CHECK: error: Unknown instrumentation type in LLVM-MCA comment: RISCV-V +# CHECK: # LLVM-MCA-RISCV-V MF9 +# CHECK: ^ +# CHECK: error: There was an error parsing comments. diff --git a/llvm/test/tools/llvm-profdata/version.test b/llvm/test/tools/llvm-profdata/version.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-profdata/version.test @@ -0,0 +1,4 @@ +# RUN: llvm-profdata --version | FileCheck %s + +# CHECK: llvm-profdata +# CHECK: LLVM version {{.*}} diff --git a/llvm/tools/llvm-mca/CodeRegion.h b/llvm/tools/llvm-mca/CodeRegion.h --- a/llvm/tools/llvm-mca/CodeRegion.h +++ b/llvm/tools/llvm-mca/CodeRegion.h @@ -91,6 +91,8 @@ CodeRegion(llvm::StringRef Desc, llvm::SMLoc Start) : Description(Desc), RangeStart(Start) {} + virtual ~CodeRegion() = default; + void addInstruction(const llvm::MCInst &Instruction) { Instructions.emplace_back(Instruction); } @@ -115,14 +117,14 @@ /// in analysis of the region. 
class InstrumentRegion : public CodeRegion { /// Instrument for this region. - SharedInstrument Instrument; + UniqueInstrument I; public: - InstrumentRegion(llvm::StringRef Desc, llvm::SMLoc Start, SharedInstrument I) - : CodeRegion(Desc, Start), Instrument(I) {} + InstrumentRegion(llvm::StringRef Desc, llvm::SMLoc Start, UniqueInstrument I) + : CodeRegion(Desc, Start), I(std::move(I)) {} public: - SharedInstrument getInstrument() const { return Instrument; } + Instrument *getInstrument() const { return I.get(); } }; class CodeRegionParseError final : public Error {}; @@ -142,6 +144,7 @@ public: CodeRegions(llvm::SourceMgr &S) : SM(S), FoundErrors(false) {} + virtual ~CodeRegions() = default; typedef std::vector::iterator iterator; typedef std::vector::const_iterator const_iterator; @@ -179,14 +182,14 @@ }; struct InstrumentRegions : public CodeRegions { + InstrumentRegions(llvm::SourceMgr &S); void beginRegion(llvm::StringRef Description, llvm::SMLoc Loc, - SharedInstrument Instrument); + UniqueInstrument Instrument); void endRegion(llvm::StringRef Description, llvm::SMLoc Loc); - const SmallVector - getActiveInstruments(llvm::SMLoc Loc) const; + const SmallVector getActiveInstruments(llvm::SMLoc Loc) const; }; } // namespace mca diff --git a/llvm/tools/llvm-mca/CodeRegion.cpp b/llvm/tools/llvm-mca/CodeRegion.cpp --- a/llvm/tools/llvm-mca/CodeRegion.cpp +++ b/llvm/tools/llvm-mca/CodeRegion.cpp @@ -115,7 +115,7 @@ InstrumentRegions::InstrumentRegions(llvm::SourceMgr &S) : CodeRegions(S) {} void InstrumentRegions::beginRegion(StringRef Description, SMLoc Loc, - SharedInstrument I) { + UniqueInstrument I) { if (Description.empty()) { SM.PrintMessage(Loc, llvm::SourceMgr::DK_Error, "anonymous instrumentation regions are not permitted"); @@ -137,7 +137,8 @@ } ActiveRegions[Description] = Regions.size(); - Regions.emplace_back(std::make_unique(Description, Loc, I)); + Regions.emplace_back( + std::make_unique(Description, Loc, std::move(I))); } void 
InstrumentRegions::endRegion(StringRef Description, SMLoc Loc) { @@ -158,13 +159,13 @@ } } -const SmallVector +const SmallVector InstrumentRegions::getActiveInstruments(SMLoc Loc) const { - SmallVector AI; + SmallVector AI; for (auto &R : Regions) { if (R->isLocInRange(Loc)) { InstrumentRegion *IR = static_cast(R.get()); - AI.emplace_back(IR->getInstrument()); + AI.push_back(IR->getInstrument()); } } return AI; diff --git a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp --- a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp +++ b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp @@ -184,7 +184,7 @@ return; } - SharedInstrument I = IM.createInstrument(InstrumentKind, Data); + UniqueInstrument I = IM.createInstrument(InstrumentKind, Data); if (!I) { if (Data.empty()) SM.PrintMessage(Loc, llvm::SourceMgr::DK_Error, @@ -202,7 +202,7 @@ if (Regions.isRegionActive(InstrumentKind)) Regions.endRegion(InstrumentKind, Loc); // Start new instrumentation region - Regions.beginRegion(InstrumentKind, Loc, I); + Regions.beginRegion(InstrumentKind, Loc, std::move(I)); } } // namespace mca diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -574,7 +574,7 @@ SmallVector> LoweredSequence; for (const MCInst &MCI : Insts) { SMLoc Loc = MCI.getLoc(); - const SmallVector Instruments = + const SmallVector Instruments = InstrumentRegions.getActiveInstruments(Loc); Expected> Inst = diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -3070,6 +3070,12 @@ << "Available commands: merge, show, overlap\n"; return 0; } + + if (strcmp(argv[1], "--version") == 0) { + outs() << ProgName << '\n'; + cl::PrintVersionMessage(); + return 0; + } } if (argc < 2) diff --git a/llvm/unittests/tools/llvm-mca/MCATestBase.cpp 
b/llvm/unittests/tools/llvm-mca/MCATestBase.cpp --- a/llvm/unittests/tools/llvm-mca/MCATestBase.cpp +++ b/llvm/unittests/tools/llvm-mca/MCATestBase.cpp @@ -68,7 +68,7 @@ auto IM = std::make_unique(*STI, *MCII); mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM); - const SmallVector Instruments; + const SmallVector Instruments; SmallVector> LoweredInsts; for (const auto &MCI : Insts) { Expected> Inst = diff --git a/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp b/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp --- a/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp +++ b/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp @@ -35,7 +35,7 @@ auto IM = std::make_unique(*STI, *MCII); mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM); - const SmallVector Instruments; + const SmallVector Instruments; // Tile size = 7 for (unsigned i = 0U, E = MCIs.size(); i < E;) { for (unsigned TE = i + 7; i < TE && i < E; ++i) { @@ -127,7 +127,7 @@ mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM); IB.setInstRecycleCallback(GetRecycledInst); - const SmallVector Instruments; + const SmallVector Instruments; // Tile size = 7 for (unsigned i = 0U, E = MCIs.size(); i < E;) { for (unsigned TE = i + 7; i < TE && i < E; ++i) { diff --git a/mlir/docs/BytecodeFormat.md b/mlir/docs/BytecodeFormat.md --- a/mlir/docs/BytecodeFormat.md +++ b/mlir/docs/BytecodeFormat.md @@ -339,11 +339,20 @@ numSuccessors: varint?, successors: varint[], + numUseListOrders: varint?, + useListOrders: uselist[], + regionEncoding: varint?, // (numRegions << 1) | (isIsolatedFromAbove) // regions are stored in a section if isIsolatedFromAbove regions: (region | region_section)[] } + +uselist { + indexInRange: varint?, + useListEncoding: varint, // (numIndices << 1) | (isIndexPairEncoding) + indices: varint[] +} ``` The encoding of an operation is important because this is generally the most @@ -377,6 +386,26 @@ If the operation has successors, the number of successors and 
the indexes of the successor blocks within the parent region are encoded. +##### Use-list orders + +The reference use-list order is assumed to be the reverse of the global +enumeration of all the op operands that one would obtain with a pre-order walk +of the IR. This order is naturally obtained by building blocks of operations +op-by-op. However, some transformations may shuffle the use-lists with respect +to this reference ordering. If any of the results of the operation have a +use-list order that is not sorted with respect to the reference use-list order, +an encoding is emitted such that it is possible to reconstruct such order after +parsing the bytecode. The encoding represents an index map from the reference +operand order to the current use-list order. A bit flag is used to detect if +this encoding is of type index-pair or not. When the bit flag is set to zero, +the element at `i` represents the position of the use `i` of the reference list +into the current use-list. When the bit flag is set to `1`, the encoding +represents index pairs `(i, j)`, which indicate that the use at position `i` of +the reference list is mapped to position `j` in the current use-list. When +less than half of the elements in the current use-list are shuffled with respect +to the reference use-list, the index-pair encoding is used to reduce the +bytecode memory requirements. + +##### Regions + If the operation has regions, the number of regions and if the regions are @@ -410,6 +439,8 @@ block_arguments { numArgs: varint?, args: block_argument[] + numUseListOrders: varint?, + useListOrders: uselist[], } block_argument { @@ -421,3 +452,6 @@ A block is encoded with an array of operations and block arguments. The first field is an encoding that combines the number of operations in the block, with a flag indicating if the block has arguments. + +Use-list orders are attached to block arguments similarly to how they are +attached to operation results. 
diff --git a/mlir/lib/Bytecode/Encoding.h b/mlir/include/mlir/Bytecode/Encoding.h rename from mlir/lib/Bytecode/Encoding.h rename to mlir/include/mlir/Bytecode/Encoding.h --- a/mlir/lib/Bytecode/Encoding.h +++ b/mlir/include/mlir/Bytecode/Encoding.h @@ -11,10 +11,12 @@ // //===----------------------------------------------------------------------===// -#ifndef LIB_MLIR_BYTECODE_ENCODING_H -#define LIB_MLIR_BYTECODE_ENCODING_H +#ifndef MLIR_BYTECODE_ENCODING_H +#define MLIR_BYTECODE_ENCODING_H +#include "mlir/IR/Value.h" #include +#include namespace mlir { namespace bytecode { @@ -27,7 +29,7 @@ kMinSupportedVersion = 0, /// The current bytecode version. - kVersion = 2, + kVersion = 3, /// An arbitrary value used to fill alignment padding. kAlignmentByte = 0xCB, @@ -87,10 +89,27 @@ kHasOperands = 0b00000100, kHasSuccessors = 0b00001000, kHasInlineRegions = 0b00010000, + kHasUseListOrders = 0b00100000, // clang-format on }; } // namespace OpEncodingMask +/// Get the unique ID of a value use. We encode the unique ID combining an owner +/// number and the argument number such as if ownerID(op1) < ownerID(op2), then +/// useID(op1) < useID(op2). If uses have the same owner, then argNumber(op1) < +/// argNumber(op2) implies useID(op1) < useID(op2). +template +static inline uint64_t getUseID(OperandT &val, unsigned ownerID) { + uint32_t operandNumberID; + if constexpr (std::is_same_v) + operandNumberID = val.getOperandNumber(); + else if constexpr (std::is_same_v) + operandNumberID = val.getArgNumber(); + else + llvm_unreachable("unexpected operand type"); + return (static_cast(ownerID) << 32) | operandNumberID; +} + } // namespace bytecode } // namespace mlir diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h --- a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h @@ -82,12 +82,12 @@ /// Returns the type of a pointer to an element of the memref. 
Type getElementPtrType(MemRefType type) const; - /// Computes sizes, strides and buffer size in bytes of `memRefType` with - /// identity layout. Emits constant ops for the static sizes of `memRefType`, - /// and uses `dynamicSizes` for the others. Emits instructions to compute - /// strides and buffer size from these sizes. + /// Computes sizes, strides and buffer size of `memRefType` with identity + /// layout. Emits constant ops for the static sizes of `memRefType`, and uses + /// `dynamicSizes` for the others. Emits instructions to compute strides and + /// buffer size from these sizes. /// - /// For example, memref<4x?xf32> emits: + /// For example, memref<4x?xf32> with `sizeInBytes = true` emits: /// `sizes[0]` = llvm.mlir.constant(4 : index) : i64 /// `sizes[1]` = `dynamicSizes[0]` /// `strides[1]` = llvm.mlir.constant(1 : index) : i64 @@ -97,19 +97,27 @@ /// %gep = llvm.getelementptr %nullptr[%size] /// : (!llvm.ptr, i64) -> !llvm.ptr /// `sizeBytes` = llvm.ptrtoint %gep : !llvm.ptr to i64 + /// + /// If `sizeInBytes = false`, memref<4x?xf32> emits: + /// `sizes[0]` = llvm.mlir.constant(4 : index) : i64 + /// `sizes[1]` = `dynamicSizes[0]` + /// `strides[1]` = llvm.mlir.constant(1 : index) : i64 + /// `strides[0]` = `sizes[0]` + /// %size = llvm.mul `sizes[0]`, `sizes[1]` : i64 void getMemRefDescriptorSizes(Location loc, MemRefType memRefType, ValueRange dynamicSizes, ConversionPatternRewriter &rewriter, SmallVectorImpl &sizes, - SmallVectorImpl &strides, - Value &sizeBytes) const; + SmallVectorImpl &strides, Value &size, + bool sizeInBytes = true) const; /// Computes the size of type in bytes. Value getSizeInBytes(Location loc, Type type, ConversionPatternRewriter &rewriter) const; - /// Computes total number of elements for the given shape. - Value getNumElements(Location loc, ArrayRef shape, + /// Computes total number of elements for the given MemRef and dynamicSizes. 
+ Value getNumElements(Location loc, MemRefType memRefType, + ValueRange dynamicSizes, ConversionPatternRewriter &rewriter) const; /// Creates and populates a canonical memref descriptor struct. diff --git a/mlir/include/mlir/Conversion/MemRefToLLVM/AllocLikeConversion.h b/mlir/include/mlir/Conversion/MemRefToLLVM/AllocLikeConversion.h --- a/mlir/include/mlir/Conversion/MemRefToLLVM/AllocLikeConversion.h +++ b/mlir/include/mlir/Conversion/MemRefToLLVM/AllocLikeConversion.h @@ -20,8 +20,10 @@ using ConvertToLLVMPattern::getVoidPtrType; explicit AllocationOpLLVMLowering(StringRef opName, - LLVMTypeConverter &converter) - : ConvertToLLVMPattern(opName, &converter.getContext(), converter) {} + LLVMTypeConverter &converter, + PatternBenefit benefit = 1) + : ConvertToLLVMPattern(opName, &converter.getContext(), converter, + benefit) {} protected: /// Computes the aligned value for 'input' as follows: @@ -103,15 +105,20 @@ /// Lowering for AllocOp and AllocaOp. struct AllocLikeOpLLVMLowering : public AllocationOpLLVMLowering { explicit AllocLikeOpLLVMLowering(StringRef opName, - LLVMTypeConverter &converter) - : AllocationOpLLVMLowering(opName, converter) {} + LLVMTypeConverter &converter, + PatternBenefit benefit = 1) + : AllocationOpLLVMLowering(opName, converter, benefit) {} protected: /// Allocates the underlying buffer. Returns the allocated pointer and the /// aligned pointer. virtual std::tuple - allocateBuffer(ConversionPatternRewriter &rewriter, Location loc, - Value sizeBytes, Operation *op) const = 0; + allocateBuffer(ConversionPatternRewriter &rewriter, Location loc, Value size, + Operation *op) const = 0; + + /// Sets the flag 'requiresNumElements', specifying the Op requires the number + /// of elements instead of the size in bytes. 
+ void setRequiresNumElements(); private: // An `alloc` is converted into a definition of a memref descriptor value and @@ -133,6 +140,10 @@ LogicalResult matchAndRewrite(Operation *op, ArrayRef operands, ConversionPatternRewriter &rewriter) const override; + + // Flag for specifying the Op requires the number of elements instead of the + // size in bytes. + bool requiresNumElements = false; }; } // namespace mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -172,8 +172,9 @@ LLVM_ScalarOrVectorOf, "fneg", "FNeg">; // Memory-related operations. -def LLVM_AllocaOp : LLVM_Op<"alloca", - [DeclareOpInterfaceMethods]>, +def LLVM_AllocaOp : LLVM_Op<"alloca", + [DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods]>, LLVM_MemOpPatterns { let arguments = (ins AnyInteger:$arraySize, OptionalAttr:$alignment, @@ -232,7 +233,9 @@ } def LLVM_GEPOp : LLVM_Op<"getelementptr", [Pure, - DeclareOpInterfaceMethods]> { + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods]> { let arguments = (ins LLVM_ScalarOrVectorOf:$base, Variadic>:$dynamicIndices, DenseI32ArrayAttr:$rawConstantIndices, @@ -316,7 +319,8 @@ } def LLVM_LoadOp : LLVM_MemAccessOpBase<"load", - [DeclareOpInterfaceMethods]> { + [DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods]> { dag args = (ins Arg, "", [MemRead]>:$addr, OptionalAttr:$alignment, UnitAttr:$volatile_, @@ -388,7 +392,8 @@ } def LLVM_StoreOp : LLVM_MemAccessOpBase<"store", - [DeclareOpInterfaceMethods]> { + [DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods]> { dag args = (ins LLVM_LoadableType:$value, Arg,"",[MemWrite]>:$addr, OptionalAttr:$alignment, diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h 
@@ -16,6 +16,7 @@ #include "mlir/IR/Types.h" #include "mlir/Interfaces/DataLayoutInterfaces.h" +#include "mlir/Interfaces/MemorySlotInterfaces.h" #include namespace llvm { @@ -103,6 +104,7 @@ class LLVMStructType : public Type::TypeBase { public: /// Inherit base constructors. @@ -198,6 +200,12 @@ LogicalResult verifyEntries(DataLayoutEntryListRef entries, Location loc) const; + + /// Destructs the struct into its indexed field types. + std::optional> getSubelementIndexMap(); + + /// Returns which type is stored at a given integer index within the struct. + Type getTypeAtIndex(Attribute index); }; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td @@ -12,6 +12,7 @@ include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/IR/AttrTypeBase.td" include "mlir/Interfaces/DataLayoutInterfaces.td" +include "mlir/Interfaces/MemorySlotInterfaces.td" /// Base class for all LLVM dialect types. class LLVMType traits = []> @@ -24,7 +25,8 @@ //===----------------------------------------------------------------------===// def LLVMArrayType : LLVMType<"LLVMArray", "array", [ - DeclareTypeInterfaceMethods]> { + DeclareTypeInterfaceMethods, + DeclareTypeInterfaceMethods]> { let summary = "LLVM array type"; let description = [{ The `!llvm.array` type represents a fixed-size array of element types. diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -819,7 +819,7 @@ loop nests, which can be empty. }]; - // Also allow any !pdl.operation for simpler composition. 
Non-tensor.pad ops + // Also allow any payload operation for simpler composition. Non-tensor.pad ops // will be dropped from the results. let arguments = (ins TransformHandleTypeInterface:$target, @@ -862,7 +862,7 @@ tensor.pad operations, which can be empty. }]; - // Also allow any !pdl.operation for simpler composition. Non-tensor.pad ops + // Also allow any operation for simpler composition. Non-tensor.pad ops // will be dropped from the results. let arguments = (ins TransformHandleTypeInterface:$target, diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -636,7 +636,8 @@ UnitAttr:$selfAttr, OptionalAttr:$reductionOp, Variadic:$reductionOperands, - Variadic:$gangPrivateOperands, + Variadic:$gangPrivateOperands, + OptionalAttr:$privatizations, Variadic:$gangFirstPrivateOperands, Variadic:$dataClauseOperands, OptionalAttr:$defaultAttr); @@ -659,7 +660,9 @@ type($gangFirstPrivateOperands) `)` | `num_gangs` `(` $numGangs `:` type($numGangs) `)` | `num_workers` `(` $numWorkers `:` type($numWorkers) `)` - | `private` `(` $gangPrivateOperands `:` type($gangPrivateOperands) `)` + | `private` `(` custom( + $gangPrivateOperands, type($gangPrivateOperands), $privatizations) + `)` | `vector_length` `(` $vectorLength `:` type($vectorLength) `)` | `wait` `(` $waitOperands `:` type($waitOperands) `)` | `self` `(` $selfCond `)` @@ -701,7 +704,8 @@ UnitAttr:$selfAttr, OptionalAttr:$reductionOp, Variadic:$reductionOperands, - Variadic:$gangPrivateOperands, + Variadic:$gangPrivateOperands, + OptionalAttr:$privatizations, Variadic:$gangFirstPrivateOperands, Variadic:$dataClauseOperands, OptionalAttr:$defaultAttr); @@ -722,7 +726,9 @@ | `async` `(` $async `:` type($async) `)` | `firstprivate` `(` $gangFirstPrivateOperands `:` type($gangFirstPrivateOperands) `)` - | `private` `(` $gangPrivateOperands `:` 
type($gangPrivateOperands) `)` + | `private` `(` custom( + $gangPrivateOperands, type($gangPrivateOperands), $privatizations) + `)` | `wait` `(` $waitOperands `:` type($waitOperands) `)` | `self` `(` $selfCond `)` | `if` `(` $ifCond `)` @@ -1033,7 +1039,8 @@ UnitAttr:$hasWorker, UnitAttr:$hasVector, Variadic:$tileOperands, - Variadic:$privateOperands, + Variadic:$privateOperands, + OptionalAttr:$privatizations, OptionalAttr:$reductionOp, Variadic:$reductionOperands); @@ -1053,7 +1060,9 @@ `gang` `` custom($gangNum, type($gangNum), $gangStatic, type($gangStatic), $hasGang) | `worker` `` custom($workerNum, type($workerNum), $hasWorker) | `vector` `` custom($vectorLength, type($vectorLength), $hasVector) - | `private` `(` $privateOperands `:` type($privateOperands) `)` + | `private` `(` custom( + $privateOperands, type($privateOperands), $privatizations) + `)` | `tile` `(` $tileOperands `:` type($tileOperands) `)` | `reduction` `(` $reductionOperands `:` type($reductionOperands) `)` ) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPInterfaces.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPInterfaces.h --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPInterfaces.h +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPInterfaces.h @@ -30,6 +30,12 @@ struct OffloadModuleDefaultModel : public OffloadModuleInterface::ExternalModel {}; + +template +struct DeclareTargetDefaultModel + : public DeclareTargetInterface::ExternalModel, + T> {}; + } // namespace mlir::omp #endif // MLIR_DIALECT_OPENMP_OPENMPINTERFACES_H_ diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -26,7 +26,7 @@ def OpenMP_Dialect : Dialect { let name = "omp"; let cppNamespace = "::mlir::omp"; - let dependentDialects = ["::mlir::LLVM::LLVMDialect"]; + let dependentDialects = ["::mlir::LLVM::LLVMDialect, ::mlir::func::FuncDialect"]; let 
useDefaultAttributePrinterParser = 1; let usePropertiesForAttributes = 1; } @@ -97,6 +97,52 @@ def OpenMP_PointerLikeType : TypeAlias; +//===----------------------------------------------------------------------===// +// 2.12.7 Declare Target Directive +//===----------------------------------------------------------------------===// + +def DeviceTypeAny : I32EnumAttrCase<"any", 0>; +def DeviceTypeHost : I32EnumAttrCase<"host", 1>; +def DeviceTypeNoHost : I32EnumAttrCase<"nohost", 2>; + +def DeclareTargetDeviceType : I32EnumAttr< + "DeclareTargetDeviceType", + "device_type clause", + [DeviceTypeAny, DeviceTypeHost, DeviceTypeNoHost]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::omp"; +} + +def DeclareTargetDeviceTypeAttr : EnumAttr { + let assemblyFormat = "`(` $value `)`"; +} + +def CaptureClauseTo : I32EnumAttrCase<"to", 0>; +def CaptureClauseLink : I32EnumAttrCase<"link", 1>; + +def DeclareTargetCaptureClause : I32EnumAttr< + "DeclareTargetCaptureClause", + "capture clause", + [CaptureClauseTo, CaptureClauseLink]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::omp"; +} + +def DeclareTargetCaptureClauseAttr : EnumAttr { + let assemblyFormat = "`(` $value `)`"; +} + +def DeclareTargetAttr : OpenMP_Attr<"DeclareTarget", "declaretarget"> { + let parameters = (ins + OptionalParameter<"DeclareTargetDeviceTypeAttr">:$device_type, + OptionalParameter<"DeclareTargetCaptureClauseAttr">:$capture_clause + ); + + let assemblyFormat = "`<` struct(params) `>`"; +} + +//===----------------------------------------------------------------------===// +// 2.6 parallel Construct +//===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td @@ -47,6 +47,74 @@ ]; } +def
DeclareTargetInterface : OpInterface<"DeclareTargetInterface"> { + let description = [{ + OpenMP operations that support declare target have this interface. + For example, FuncOp's and llvm.GlobalOp/fir.GlobalOp's. This + interface allows simple manipulation and introspection of the + declare target attribute that can be applied to these operations. + }]; + + let cppNamespace = "::mlir::omp"; + + let methods = [ + InterfaceMethod< + /*description=*/[{ + Set the declare target attribute on the current operation with the + specified attribute arguments. + }], + /*retTy=*/"void", + /*methodName=*/"setDeclareTarget", + (ins "mlir::omp::DeclareTargetDeviceType":$deviceType, + "mlir::omp::DeclareTargetCaptureClause":$captureClause), [{}], [{ + $_op->setAttr("omp.declare_target", + mlir::omp::DeclareTargetAttr::get( + $_op->getContext(), + mlir::omp::DeclareTargetDeviceTypeAttr::get( + $_op->getContext(), deviceType), + mlir::omp::DeclareTargetCaptureClauseAttr::get( + $_op->getContext(), captureClause))); + }]>, + InterfaceMethod< + /*description=*/[{ + Checks if the declare target attribute has been applied and exists on the + current operation. Returns true if it exists on it, otherwise returns + false. + }], + /*retTy=*/"bool", + /*methodName=*/"isDeclareTarget", + (ins), [{}], [{ + return $_op->hasAttr("omp.declare_target"); + }]>, + InterfaceMethod< + /*description=*/[{ + Returns the DeclareTargetDeviceType segment of the DeclareTarget attribute if it + exists on the current operation. Otherwise it returns null. + }], + /*retTy=*/"mlir::omp::DeclareTargetDeviceType", + /*methodName=*/"getDeclareTargetDeviceType", + (ins), [{}], [{ + if (mlir::Attribute dTar = $_op->getAttr("omp.declare_target")) + if (auto dAttr = dTar.dyn_cast_or_null()) + return dAttr.getDeviceType().getValue(); + return {}; + }]>, + InterfaceMethod< + /*description=*/[{ + Returns the DeclareTargetCaptureClause segment of the DeclareTarget attribute if it + exists on the current operation. 
Otherwise it returns null. + }], + /*retTy=*/"mlir::omp::DeclareTargetCaptureClause", + /*methodName=*/"getDeclareTargetCaptureClause", + (ins), [{}], [{ + if (mlir::Attribute dTar = $_op->getAttr("omp.declare_target")) + if (auto dAttr = dTar.dyn_cast_or_null()) + return dAttr.getCaptureClause().getValue(); + return {}; + }]> + ]; +} + def OffloadModuleInterface : OpInterface<"OffloadModuleInterface"> { let description = [{ Operations that represent a module for offloading (host or device) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td @@ -93,7 +93,6 @@ // of 3x4 matrix |0.0, 0.0, 2.2, 3.3| // |0.0, 0.0, 0.0, 0.0| ``` - ``` }]; let assemblyFormat = diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -944,6 +944,8 @@ let results = (outs Tosa_Tensor:$output ); + + let hasFolder = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -77,16 +77,16 @@ ```mlir %result = transform.alternatives %scope { - ^bb0(%arg0: !pdl.operation): + ^bb0(%arg0: !transform.any_op): // Try a fallible transformation. %0 = transform.fallible %arg0 // ... // If succeeded, yield the the result of the transformation. 
- transform.yield %0 : !pdl.operation + transform.yield %0 : !transform.any_op }, { - ^bb0(%arg0: !pdl.operation): + ^bb0(%arg0: !transform.any_op): // Otherwise, the second alternative is tried and it always succeeds by // returning the original handle. - transform.yield %arg0 : !pdl.operation + transform.yield %arg0 : !transform.any_op } ``` }]; @@ -767,7 +767,7 @@ ```mlir transform.with_pdl_patterns { - ^bb0(%arg0: !pdl.operation): + ^bb0(%arg0: !transform.any_op): pdl.pattern @my_pattern : benefit(1) { %0 = pdl.operation //... // Regular PDL goes here. @@ -775,7 +775,7 @@ } sequence %arg0 failures(propagate) { - ^bb0(%arg1: !pdl.operation): + ^bb0(%arg1: !transform.any_op): %1 = pdl_match @my_pattern in %arg1 // Use %1 as handle } diff --git a/mlir/include/mlir/IR/UseDefLists.h b/mlir/include/mlir/IR/UseDefLists.h --- a/mlir/include/mlir/IR/UseDefLists.h +++ b/mlir/include/mlir/IR/UseDefLists.h @@ -44,6 +44,21 @@ /// of the SSA machinery. IROperandBase *getNextOperandUsingThisValue() { return nextUse; } + /// Initialize the use-def chain by setting the back address to self and + /// nextUse to nullptr. + void initChainWithUse(IROperandBase **self) { + assert(this == *self); + back = self; + nextUse = nullptr; + } + + /// Link the current node to next. + void linkTo(IROperandBase *next) { + nextUse = next; + if (nextUse) + nextUse->back = &nextUse; + } + protected: IROperandBase(Operation *owner) : owner(owner) {} IROperandBase(IROperandBase &&other) : owner(other.owner) { @@ -192,6 +207,30 @@ use_begin()->set(newValue); } + /// Shuffle the use-list chain according to the provided indices vector, which + /// need to represent a valid shuffle. That is, a vector of unique integers in + /// range [0, numUses - 1]. Users of this function need to guarantee the + /// validity of the indices vector. 
+ void shuffleUseList(ArrayRef indices) { + assert((size_t)std::distance(getUses().begin(), getUses().end()) == + indices.size() && + "indices vector expected to have a number of elements equal to the " + "number of uses"); + SmallVector shuffled(indices.size()); + detail::IROperandBase *ptr = firstUse; + for (size_t idx = 0; idx < indices.size(); + idx++, ptr = ptr->getNextOperandUsingThisValue()) + shuffled[indices[idx]] = ptr; + + initFirstUse(shuffled.front()); + auto *current = firstUse; + for (auto &next : llvm::drop_begin(shuffled)) { + current->linkTo(next); + current = next; + } + current->linkTo(nullptr); + } + //===--------------------------------------------------------------------===// // Uses //===--------------------------------------------------------------------===// @@ -234,6 +273,12 @@ OperandType *getFirstUse() const { return (OperandType *)firstUse; } private: + /// Set use as the first use of the chain. + void initFirstUse(detail::IROperandBase *use) { + firstUse = use; + firstUse->initChainWithUse(&firstUse); + } + detail::IROperandBase *firstUse = nullptr; /// Allow access to `firstUse`. diff --git a/mlir/include/mlir/IR/Value.h b/mlir/include/mlir/IR/Value.h --- a/mlir/include/mlir/IR/Value.h +++ b/mlir/include/mlir/IR/Value.h @@ -187,6 +187,11 @@ /// Returns true if the value is used outside of the given block. bool isUsedOutsideOfBlock(Block *block); + /// Shuffle the use list order according to the provided indices. It is + /// responsibility of the caller to make sure that the indices map the current + /// use-list chain to another valid use-list chain. 
+ void shuffleUseList(ArrayRef indices); + //===--------------------------------------------------------------------===// // Uses diff --git a/mlir/include/mlir/Interfaces/CMakeLists.txt b/mlir/include/mlir/Interfaces/CMakeLists.txt --- a/mlir/include/mlir/Interfaces/CMakeLists.txt +++ b/mlir/include/mlir/Interfaces/CMakeLists.txt @@ -19,6 +19,8 @@ set(LLVM_TARGET_DEFINITIONS MemorySlotInterfaces.td) mlir_tablegen(MemorySlotOpInterfaces.h.inc -gen-op-interface-decls) mlir_tablegen(MemorySlotOpInterfaces.cpp.inc -gen-op-interface-defs) +mlir_tablegen(MemorySlotTypeInterfaces.h.inc -gen-type-interface-decls) +mlir_tablegen(MemorySlotTypeInterfaces.cpp.inc -gen-type-interface-defs) add_public_tablegen_target(MLIRMemorySlotInterfacesIncGen) add_dependencies(mlir-generic-headers MLIRMemorySlotInterfacesIncGen) diff --git a/mlir/include/mlir/Interfaces/MemorySlotInterfaces.h b/mlir/include/mlir/Interfaces/MemorySlotInterfaces.h --- a/mlir/include/mlir/Interfaces/MemorySlotInterfaces.h +++ b/mlir/include/mlir/Interfaces/MemorySlotInterfaces.h @@ -24,6 +24,13 @@ Type elemType; }; +/// Memory slot attached with information about its destructuring procedure. +struct DestructurableMemorySlot : public MemorySlot { + /// Maps an index within the memory slot to the type of the pointer that + /// will be generated to access the element directly. + DenseMap elementPtrs; +}; + /// Returned by operation promotion logic requesting the deletion of an /// operation. 
enum class DeletionKind { @@ -36,5 +43,6 @@ } // namespace mlir #include "mlir/Interfaces/MemorySlotOpInterfaces.h.inc" +#include "mlir/Interfaces/MemorySlotTypeInterfaces.h.inc" #endif // MLIR_INTERFACES_MEMORYSLOTINTERFACES_H diff --git a/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td b/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td --- a/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td +++ b/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td @@ -215,4 +215,158 @@ ]; } +def DestructurableAllocationOpInterface + : OpInterface<"DestructurableAllocationOpInterface"> { + let description = [{ + Describes operations allocating memory slots of aggregates that can be + destructured into multiple smaller allocations. + }]; + let cppNamespace = "::mlir"; + + let methods = [ + InterfaceMethod<[{ + Returns the list of slots for which destructuring should be attempted, + specifying in which way the slot should be destructured into subslots. + The subslots are indexed by attributes. This computes the type of the + pointer for each subslot to be generated. The type of the memory slot + must implement `DestructurableTypeInterface`. + + No IR mutation is allowed in this method. + }], + "::llvm::SmallVector<::mlir::DestructurableMemorySlot>", + "getDestructurableSlots", + (ins) + >, + InterfaceMethod<[{ + Destructures this slot into multiple subslots. The newly generated slots + may belong to a different allocator. The original slot must still exist + at the end of this call. Only generates subslots for the indices found in + `usedIndices` since all other subslots are unused. + + The rewriter is located at the beginning of the block where the slot + pointer is defined. All IR mutations must happen through the rewriter. 
+ }], + "::llvm::DenseMap<::mlir::Attribute, ::mlir::MemorySlot>", + "destructure", + (ins "const ::mlir::DestructurableMemorySlot &":$slot, + "const ::llvm::SmallPtrSetImpl<::mlir::Attribute> &":$usedIndices, + "::mlir::RewriterBase &":$rewriter) + >, + InterfaceMethod<[{ + Hook triggered once the destructuring of a slot is complete, meaning the + original slot is no longer being referred to and could be deleted. + This will only be called for slots declared by this operation. + + All IR mutations must happen through the rewriter. + }], + "void", "handleDestructuringComplete", + (ins "const ::mlir::DestructurableMemorySlot &":$slot, + "::mlir::RewriterBase &":$rewriter) + >, + ]; +} + +def SafeMemorySlotAccessOpInterface + : OpInterface<"SafeMemorySlotAccessOpInterface"> { + let description = [{ + Describes operations using memory slots in a type-safe manner. + }]; + let cppNamespace = "::mlir"; + + let methods = [ + InterfaceMethod<[{ + Returns whether all accesses in this operation to the provided slot are + done in a type-safe manner. To be type-safe, the access must only load + the value in this type as the type of the slot, and without assuming any + context around the slot. For example, a type-safe load must not load + outside the bounds of the slot. + + If the type-safety of the accesses depends on the type-safety of the + accesses to further memory slots, the result of this method will be + conditioned to the type-safety of the accesses to the slots added by + this method to `mustBeSafelyUsed`. + + No IR mutation is allowed in this method. + }], + "::mlir::LogicalResult", + "ensureOnlySafeAccesses", + (ins "const ::mlir::MemorySlot &":$slot, + "::mlir::SmallVectorImpl<::mlir::MemorySlot> &":$mustBeSafelyUsed) + > + ]; +} + +def DestructurableAccessorOpInterface + : OpInterface<"DestructurableAccessorOpInterface"> { + let description = [{ + Describes operations that can access a sub-element of a destructurable slot.
+ }]; + let cppNamespace = "::mlir"; + + let methods = [ + InterfaceMethod<[{ + For a given destructurable memory slot, returns whether this operation can + rewire its uses of the slot to use the slots generated after + destructuring. This may involve creating new operations, and usually + amounts to checking if the pointer types match. + + This method must also register the indices it will access within the + `usedIndices` set. If the accessor generates new slots mapping to + subelements, they must be registered in `mustBeSafelyUsed` to ensure + they are used in a locally type-safe manner. + + No IR mutation is allowed in this method. + }], + "bool", + "canRewire", + (ins "const ::mlir::DestructurableMemorySlot &":$slot, + "::llvm::SmallPtrSetImpl<::mlir::Attribute> &":$usedIndices, + "::mlir::SmallVectorImpl<::mlir::MemorySlot> &":$mustBeSafelyUsed) + >, + InterfaceMethod<[{ + Rewires the use of a slot to the generated subslots, without deleting + any operation. Returns whether the accessor should be deleted. + + All IR mutations must happen through the rewriter. Deletion of + operations is not allowed, only the accessor can be scheduled for + deletion by returning the appropriate value. + }], + "::mlir::DeletionKind", + "rewire", + (ins "const ::mlir::DestructurableMemorySlot &":$slot, + "::llvm::DenseMap<::mlir::Attribute, ::mlir::MemorySlot> &":$subslots, + "::mlir::RewriterBase &":$rewriter) + > + ]; +} + +def DestructurableTypeInterface + : TypeInterface<"DestructurableTypeInterface"> { + let description = [{ + Describes a type that can be broken down into indexable sub-element types. + }]; + let cppNamespace = "::mlir"; + + let methods = [ + InterfaceMethod<[{ + Destructures the type into subelements into a map of index attributes to + types of subelements. Returns nothing if the type cannot be destructured. 
+ }], + "::std::optional<::llvm::DenseMap<::mlir::Attribute, ::mlir::Type>>", + "getSubelementIndexMap", + (ins) + >, + InterfaceMethod<[{ + Indicates which type is held at the provided index, returning a null + Type if no type could be computed. While this can return information + even when the type cannot be completely destructured, it must be coherent + with the types returned by `getSubelementIndexMap` when they exist. + }], + "::mlir::Type", + "getTypeAtIndex", + (ins "::mlir::Attribute":$index) + > + ]; +} + #endif // MLIR_INTERFACES_MEMORYSLOTINTERFACES diff --git a/mlir/include/mlir/Transforms/Mem2Reg.h b/mlir/include/mlir/Transforms/Mem2Reg.h --- a/mlir/include/mlir/Transforms/Mem2Reg.h +++ b/mlir/include/mlir/Transforms/Mem2Reg.h @@ -17,8 +17,11 @@ namespace mlir { +/// Statistics collected while applying mem2reg. struct Mem2RegStatistics { + /// Total amount of memory slots promoted. llvm::Statistic *promotedAmount = nullptr; + /// Total amount of new block arguments inserted in blocks. 
llvm::Statistic *newBlockArgumentAmount = nullptr; }; diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -36,6 +36,7 @@ #define GEN_PASS_DECL_MEM2REG #define GEN_PASS_DECL_PRINTIRPASS #define GEN_PASS_DECL_PRINTOPSTATS +#define GEN_PASS_DECL_SROA #define GEN_PASS_DECL_STRIPDEBUGINFO #define GEN_PASS_DECL_SCCP #define GEN_PASS_DECL_SYMBOLDCE diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td --- a/mlir/include/mlir/Transforms/Passes.td +++ b/mlir/include/mlir/Transforms/Passes.td @@ -199,10 +199,10 @@ let statistics = [ Statistic<"promotedAmount", "promoted slots", - "Number of promoted memory slot">, + "Total amount of memory slot promoted">, Statistic<"newBlockArgumentAmount", "new block args", - "Total number of block arguments added">, + "Total amount of new block argument inserted in blocks">, ]; } @@ -229,6 +229,42 @@ let constructor = "mlir::createSCCPPass()"; } +def SROA : Pass<"sroa"> { + let summary = "Scalar Replacement of Aggregates"; + let description = [{ + Scalar Replacement of Aggregates. Replaces allocations of aggregates into + independent allocations of their elements. + + Allocators must implement `DestructurableAllocationOpInterface` to provide + the list of memory slots for which destructuring should be attempted. + + This pass will only be applied if all accessors of the aggregate implement + the `DestructurableAccessorOpInterface`. If the accessors provide a view + into the struct, users of the view must ensure it is used in a type-safe + manner and within bounds by implementing `SafeMemorySlotAccessOpInterface`.
+ }]; + + let statistics = [ + Statistic< + "destructuredAmount", + "destructured slots", + "Total amount of memory slots destructured" + >, + Statistic< + "slotsWithMemoryBenefit", + "slots with memory benefit", + "Total amount of memory slots in which the destructured size was smaller " + "than the total size after eliminating unused fields" + >, + Statistic< + "maxSubelementAmount", + "max subelement number", + "Maximal number of sub-elements a successfully destructured slot " + "initially had" + >, + ]; +} + def StripDebugInfo : Pass<"strip-debuginfo"> { let summary = "Strip debug info from all operations"; let description = [{ diff --git a/mlir/include/mlir/Transforms/SROA.h b/mlir/include/mlir/Transforms/SROA.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Transforms/SROA.h @@ -0,0 +1,57 @@ +//===-- SROA.h - Scalar Replacement Of Aggregates ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TRANSFORMS_SROA_H +#define MLIR_TRANSFORMS_SROA_H + +#include "mlir/IR/PatternMatch.h" +#include "mlir/Interfaces/MemorySlotInterfaces.h" +#include "mlir/Support/LogicalResult.h" +#include "llvm/ADT/Statistic.h" + +namespace mlir { + +/// Statistics collected while applying SROA. +struct SROAStatistics { + /// Total amount of memory slots destructured. + llvm::Statistic *destructuredAmount = nullptr; + /// Total amount of memory slots in which the destructured size was smaller + /// than the total size after eliminating unused fields. + llvm::Statistic *slotsWithMemoryBenefit = nullptr; + /// Maximal number of sub-elements a successfully destructured slot initially + /// had. 
+ llvm::Statistic *maxSubelementAmount = nullptr; +}; + +/// Pattern applying SROA to the regions of the operations on which it +/// matches. +class SROAPattern + : public OpInterfaceRewritePattern { +public: + using OpInterfaceRewritePattern::OpInterfaceRewritePattern; + + SROAPattern(MLIRContext *context, SROAStatistics statistics = {}, + PatternBenefit benefit = 1) + : OpInterfaceRewritePattern(context, benefit), statistics(statistics) {} + + LogicalResult matchAndRewrite(DestructurableAllocationOpInterface allocator, + PatternRewriter &rewriter) const override; + +private: + SROAStatistics statistics; +}; + +/// Attempts to destructure the slots of destructurable allocators. Returns +/// failure if no slot was destructured. +LogicalResult tryToDestructureMemorySlots( + ArrayRef allocators, + RewriterBase &rewriter, SROAStatistics statistics = {}); + +} // namespace mlir + +#endif // MLIR_TRANSFORMS_SROA_H diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp --- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp +++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp @@ -7,12 +7,11 @@ //===----------------------------------------------------------------------===// // TODO: Support for big-endian architectures. -// TODO: Properly preserve use lists of values. #include "mlir/Bytecode/BytecodeReader.h" -#include "../Encoding.h" #include "mlir/AsmParser/AsmParser.h" #include "mlir/Bytecode/BytecodeImplementation.h" +#include "mlir/Bytecode/Encoding.h" #include "mlir/IR/BuiltinDialect.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/OpImplementation.h" @@ -29,6 +28,7 @@ #include "llvm/Support/SourceMgr.h" #include #include +#include #include #define DEBUG_TYPE "mlir-bytecode-reader" @@ -1281,6 +1281,42 @@ /// Create a value to use for a forward reference. 
Value createForwardRef(); + //===--------------------------------------------------------------------===// + // Use-list order helpers + + /// This struct is a simple storage that contains information required to + /// reorder the use-list of a value with respect to the pre-order traversal + /// ordering. + struct UseListOrderStorage { + UseListOrderStorage(bool isIndexPairEncoding, + SmallVector &&indices) + : indices(std::move(indices)), + isIndexPairEncoding(isIndexPairEncoding){}; + /// The vector containing the information required to reorder the + /// use-list of a value. + SmallVector indices; + + /// Whether indices represent a pair of type `(src, dst)` or it is a direct + /// indexing, such as `dst = order[src]`. + bool isIndexPairEncoding; + }; + + /// Parse use-list order from bytecode for a range of values if available. The + /// range is expected to be either a block argument or an op result range. On + /// success, return a map of the position in the range and the use-list order + /// encoding. The function assumes to know the size of the range it is + /// processing. + using UseListMapT = DenseMap; + FailureOr parseUseListOrderForRange(EncodingReader &reader, + uint64_t rangeSize); + + /// Shuffle the use-chain according to the order parsed. + LogicalResult sortUseListOrder(Value value); + + /// Recursively visit all the values defined within topLevelOp and sort the + /// use-list orders according to the indices parsed. + LogicalResult processUseLists(Operation *topLevelOp); + //===--------------------------------------------------------------------===// // Fields @@ -1341,17 +1377,27 @@ /// The reader used to process resources within the bytecode. ResourceSectionReader resourceReader; + /// Worklist of values with custom use-list orders to process before the end + /// of the parsing. + DenseMap valueToUseListMap; + /// The table of strings referenced within the bytecode file. 
StringSectionReader stringReader; /// The current set of available IR value scopes. std::vector valueScopes; + + /// The global pre-order operation ordering. + DenseMap operationIDs; + /// A block containing the set of operations defined to create forward /// references. Block forwardRefOps; + /// A block containing previously created, and no longer used, forward /// reference operations. Block openForwardRefOps; + /// An operation state used when instantiating forward references. OperationState forwardRefOpState; @@ -1597,6 +1643,165 @@ dialectReader, bufferOwnerRef); } +//===----------------------------------------------------------------------===// +// UseListOrder Helpers + +FailureOr +BytecodeReader::Impl::parseUseListOrderForRange(EncodingReader &reader, + uint64_t numResults) { + BytecodeReader::Impl::UseListMapT map; + uint64_t numValuesToRead = 1; + if (numResults > 1 && failed(reader.parseVarInt(numValuesToRead))) + return failure(); + + for (size_t valueIdx = 0; valueIdx < numValuesToRead; valueIdx++) { + uint64_t resultIdx = 0; + if (numResults > 1 && failed(reader.parseVarInt(resultIdx))) + return failure(); + + uint64_t numValues; + bool indexPairEncoding; + if (failed(reader.parseVarIntWithFlag(numValues, indexPairEncoding))) + return failure(); + + SmallVector useListOrders; + for (size_t idx = 0; idx < numValues; idx++) { + uint64_t index; + if (failed(reader.parseVarInt(index))) + return failure(); + useListOrders.push_back(index); + } + + // Store in a map the result index + map.try_emplace(resultIdx, UseListOrderStorage(indexPairEncoding, + std::move(useListOrders))); + } + + return map; +} + +/// Sorts each use according to the order specified in the use-list parsed. If +/// the custom use-list is not found, this means that the order needs to be +/// consistent with the reverse pre-order walk of the IR. If multiple uses lie +/// on the same operation, the order will follow the reverse operand number +/// ordering. 
+LogicalResult BytecodeReader::Impl::sortUseListOrder(Value value) { + // Early return for trivial use-lists. + if (value.use_empty() || value.hasOneUse()) + return success(); + + bool hasIncomingOrder = + valueToUseListMap.contains(value.getAsOpaquePointer()); + + // Compute the current order of the use-list with respect to the global + // ordering. Detect if the order is already sorted while doing so. + bool alreadySorted = true; + auto &firstUse = *value.use_begin(); + uint64_t prevID = + bytecode::getUseID(firstUse, operationIDs.at(firstUse.getOwner())); + llvm::SmallVector> currentOrder = {{0, prevID}}; + for (auto item : llvm::drop_begin(llvm::enumerate(value.getUses()))) { + uint64_t currentID = bytecode::getUseID( + item.value(), operationIDs.at(item.value().getOwner())); + alreadySorted &= prevID > currentID; + currentOrder.push_back({item.index(), currentID}); + prevID = currentID; + } + + // If the order is already sorted, and there wasn't a custom order to apply + // from the bytecode file, we are done. + if (alreadySorted && !hasIncomingOrder) + return success(); + + // If not already sorted, sort the indices of the current order by descending + // useIDs. + if (!alreadySorted) + std::sort( + currentOrder.begin(), currentOrder.end(), + [](auto elem1, auto elem2) { return elem1.second > elem2.second; }); + + if (!hasIncomingOrder) { + // If the bytecode file did not contain any custom use-list order, it means + // that the order was descending useID. Hence, shuffle by the first index + // of the `currentOrder` pair. + SmallVector shuffle = SmallVector( + llvm::map_range(currentOrder, [&](auto item) { return item.first; })); + value.shuffleUseList(shuffle); + return success(); + } + + // Pull the custom order info from the map. 
+ UseListOrderStorage customOrder = + valueToUseListMap.at(value.getAsOpaquePointer()); + SmallVector shuffle = std::move(customOrder.indices); + uint64_t numUses = + std::distance(value.getUses().begin(), value.getUses().end()); + + // If the encoding was a pair of indices `(src, dst)` for every permutation, + // reconstruct the shuffle vector for every use. Initialize the shuffle vector + // as identity, and then apply the mapping encoded in the indices. + if (customOrder.isIndexPairEncoding) { + // Return failure if the number of indices was not representing pairs. + if (shuffle.size() & 1) + return failure(); + + SmallVector newShuffle(numUses); + size_t idx = 0; + std::iota(newShuffle.begin(), newShuffle.end(), idx); + for (idx = 0; idx < shuffle.size(); idx += 2) + newShuffle[shuffle[idx]] = shuffle[idx + 1]; + + shuffle = std::move(newShuffle); + } + + // Make sure that the indices represent a valid mapping. That is, the sum of + // all the values needs to be equal to (numUses - 1) * numUses / 2, and no + // duplicates are allowed in the list. + DenseSet set; + uint64_t accumulator = 0; + for (const auto &elem : shuffle) { + if (set.contains(elem)) + return failure(); + accumulator += elem; + set.insert(elem); + } + if (numUses != shuffle.size() || + accumulator != (((numUses - 1) * numUses) >> 1)) + return failure(); + + // Apply the current ordering map onto the shuffle vector to get the final + // use-list sorting indices before shuffling. + shuffle = SmallVector(llvm::map_range( + currentOrder, [&](auto item) { return shuffle[item.first]; })); + value.shuffleUseList(shuffle); + return success(); +} + +LogicalResult BytecodeReader::Impl::processUseLists(Operation *topLevelOp) { + // Precompute operation IDs according to the pre-order walk of the IR. We + // can't do this while parsing since parseRegions ordering is not strictly + // equal to the pre-order walk. 
+ unsigned operationID = 0; + topLevelOp->walk( + [&](Operation *op) { operationIDs.try_emplace(op, operationID++); }); + + auto blockWalk = topLevelOp->walk([this](Block *block) { + for (auto arg : block->getArguments()) + if (failed(sortUseListOrder(arg))) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + + auto resultWalk = topLevelOp->walk([this](Operation *op) { + for (auto result : op->getResults()) + if (failed(sortUseListOrder(result))) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + + return failure(blockWalk.wasInterrupted() || resultWalk.wasInterrupted()); +} + //===----------------------------------------------------------------------===// // IR Section @@ -1627,6 +1832,11 @@ "not all forward unresolved forward operand references"); } + // Sort use-lists according to what specified in bytecode. + if (failed(processUseLists(*moduleOp))) + return reader.emitError( + "parsed use-list orders were invalid and could not be applied"); + // Resolve dialect version. for (const BytecodeDialect &byteCodeDialect : dialects) { // Parsing is complete, give an opportunity to each dialect to visit the @@ -1812,6 +2022,17 @@ } } + /// Parse the use-list orders for the results of the operation. Use-list + /// orders are available since version 3 of the bytecode. + std::optional resultIdxToUseListMap = std::nullopt; + if (version > 2 && (opMask & bytecode::OpEncodingMask::kHasUseListOrders)) { + size_t numResults = opState.types.size(); + auto parseResult = parseUseListOrderForRange(reader, numResults); + if (failed(parseResult)) + return failure(); + resultIdxToUseListMap = std::move(*parseResult); + } + /// Parse the regions of the operation. 
if (opMask & bytecode::OpEncodingMask::kHasInlineRegions) { uint64_t numRegions; @@ -1831,6 +2052,16 @@ if (op->getNumResults() && failed(defineValues(reader, op->getResults()))) return failure(); + /// Store a map for every value that received a custom use-list order from the + /// bytecode file. + if (resultIdxToUseListMap.has_value()) { + for (size_t idx = 0; idx < op->getNumResults(); idx++) { + if (resultIdxToUseListMap->contains(idx)) { + valueToUseListMap.try_emplace(op->getResult(idx).getAsOpaquePointer(), + resultIdxToUseListMap->at(idx)); + } + } + } return op; } @@ -1880,6 +2111,28 @@ if (hasArgs && failed(parseBlockArguments(reader, &*readState.curBlock))) return failure(); + // Uselist orders are available since version 3 of the bytecode. + if (version < 3) + return success(); + + uint8_t hasUseListOrders = 0; + if (hasArgs && failed(reader.parseByte(hasUseListOrders))) + return failure(); + + if (!hasUseListOrders) + return success(); + + Block &blk = *readState.curBlock; + auto argIdxToUseListMap = + parseUseListOrderForRange(reader, blk.getNumArguments()); + if (failed(argIdxToUseListMap) || argIdxToUseListMap->empty()) + return failure(); + + for (size_t idx = 0; idx < blk.getNumArguments(); idx++) + if (argIdxToUseListMap->contains(idx)) + valueToUseListMap.try_emplace(blk.getArgument(idx).getAsOpaquePointer(), + argIdxToUseListMap->at(idx)); + // We don't parse the operations of the block here, that's done elsewhere. 
return success(); } diff --git a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp --- a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp +++ b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp @@ -7,9 +7,9 @@ //===----------------------------------------------------------------------===// #include "mlir/Bytecode/BytecodeWriter.h" -#include "../Encoding.h" #include "IRNumbering.h" #include "mlir/Bytecode/BytecodeImplementation.h" +#include "mlir/Bytecode/Encoding.h" #include "mlir/IR/OpImplementation.h" #include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/MapVector.h" @@ -470,6 +470,12 @@ void writeStringSection(EncodingEmitter &emitter); + //===--------------------------------------------------------------------===// + // Helpers + + void writeUseListOrders(EncodingEmitter &emitter, uint8_t &opEncodingMask, + ValueRange range); + //===--------------------------------------------------------------------===// // Fields @@ -667,6 +673,14 @@ emitter.emitVarInt(numberingState.getNumber(arg.getType())); emitter.emitVarInt(numberingState.getNumber(arg.getLoc())); } + if (config.bytecodeVersion > 2) { + uint64_t maskOffset = emitter.size(); + uint8_t encodingMask = 0; + emitter.emitByte(0); + writeUseListOrders(emitter, encodingMask, args); + if (encodingMask) + emitter.patchByte(maskOffset, encodingMask); + } } // Emit the operations within the block. @@ -718,6 +732,11 @@ emitter.emitVarInt(numberingState.getNumber(successor)); } + // Emit the use-list orders to bytecode, so we can reconstruct the same order + // at parsing. + if (config.bytecodeVersion > 2) + writeUseListOrders(emitter, opEncodingMask, ValueRange(op->getResults())); + // Check for regions. unsigned numRegions = op->getNumRegions(); if (numRegions) @@ -749,6 +768,94 @@ } } +void BytecodeWriter::writeUseListOrders(EncodingEmitter &emitter, + uint8_t &opEncodingMask, + ValueRange range) { + // Loop over the results and store the use-list order per result index. 
+ DenseMap> map; + for (auto item : llvm::enumerate(range)) { + auto value = item.value(); + // No need to store a custom use-list order if the result does not have + // multiple uses. + if (value.use_empty() || value.hasOneUse()) + continue; + + // For each result, assemble the list of pairs (use-list-index, + // global-value-index). While doing so, detect if the global-value-index is + // already ordered with respect to the use-list-index. + bool alreadyOrdered = true; + auto &firstUse = *value.use_begin(); + uint64_t prevID = bytecode::getUseID( + firstUse, numberingState.getNumber(firstUse.getOwner())); + llvm::SmallVector> useListPairs( + {{0, prevID}}); + + for (auto use : llvm::drop_begin(llvm::enumerate(value.getUses()))) { + uint64_t currentID = bytecode::getUseID( + use.value(), numberingState.getNumber(use.value().getOwner())); + // The use-list order achieved when building the IR at parsing always + // pushes new uses on front. Hence, if the order by unique ID is + // monotonically decreasing, a roundtrip to bytecode preserves such order. + alreadyOrdered &= (prevID > currentID); + useListPairs.push_back({use.index(), currentID}); + prevID = currentID; + } + + // Do not emit if the order is already sorted. + if (alreadyOrdered) + continue; + + // Sort the use indices by the unique ID indices in descending order. + std::sort( + useListPairs.begin(), useListPairs.end(), + [](auto elem1, auto elem2) { return elem1.second > elem2.second; }); + + map.try_emplace(item.index(), llvm::map_range(useListPairs, [](auto elem) { + return elem.first; + })); + } + + if (map.empty()) + return; + + opEncodingMask |= bytecode::OpEncodingMask::kHasUseListOrders; + // Emit the number of results that have a custom use-list order if the number + // of results is greater than one. 
+ if (range.size() != 1) + emitter.emitVarInt(map.size()); + + for (const auto &item : map) { + auto resultIdx = item.getFirst(); + auto useListOrder = item.getSecond(); + + // Compute the number of uses that are actually shuffled. If those are less + // than half of the total uses, encoding the index pair `(src, dst)` is more + // space efficient. + size_t shuffledElements = + llvm::count_if(llvm::enumerate(useListOrder), + [](auto item) { return item.index() != item.value(); }); + bool indexPairEncoding = shuffledElements < (useListOrder.size() / 2); + + // For single result, we don't need to store the result index. + if (range.size() != 1) + emitter.emitVarInt(resultIdx); + + if (indexPairEncoding) { + emitter.emitVarIntWithFlag(shuffledElements * 2, indexPairEncoding); + for (auto pair : llvm::enumerate(useListOrder)) { + if (pair.index() != pair.value()) { + emitter.emitVarInt(pair.value()); + emitter.emitVarInt(pair.index()); + } + } + } else { + emitter.emitVarIntWithFlag(useListOrder.size(), indexPairEncoding); + for (const auto &index : useListOrder) + emitter.emitVarInt(index); + } + } +} + void BytecodeWriter::writeRegion(EncodingEmitter &emitter, Region *region) { // If the region is empty, we only need to emit the number of blocks (which is // zero). diff --git a/mlir/lib/Bytecode/Writer/IRNumbering.h b/mlir/lib/Bytecode/Writer/IRNumbering.h --- a/mlir/lib/Bytecode/Writer/IRNumbering.h +++ b/mlir/lib/Bytecode/Writer/IRNumbering.h @@ -152,6 +152,10 @@ assert(blockIDs.count(block) && "block not numbered"); return blockIDs[block]; } + unsigned getNumber(Operation *op) { + assert(operationIDs.count(op) && "operation not numbered"); + return operationIDs[op]; + } unsigned getNumber(OperationName opName) { assert(opNames.count(opName) && "opName not numbered"); return opNames[opName]->number; @@ -224,7 +228,8 @@ llvm::SpecificBumpPtrAllocator resourceAllocator; llvm::SpecificBumpPtrAllocator typeAllocator; - /// The value ID for each Block and Value. 
+ /// The value ID for each Operation, Block and Value. + DenseMap operationIDs; DenseMap blockIDs; DenseMap valueIDs; diff --git a/mlir/lib/Bytecode/Writer/IRNumbering.cpp b/mlir/lib/Bytecode/Writer/IRNumbering.cpp --- a/mlir/lib/Bytecode/Writer/IRNumbering.cpp +++ b/mlir/lib/Bytecode/Writer/IRNumbering.cpp @@ -7,9 +7,7 @@ //===----------------------------------------------------------------------===// #include "IRNumbering.h" -#include "../Encoding.h" #include "mlir/Bytecode/BytecodeImplementation.h" -#include "mlir/Bytecode/BytecodeWriter.h" #include "mlir/IR/AsmState.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OpDefinition.h" @@ -109,6 +107,12 @@ } IRNumberingState::IRNumberingState(Operation *op) { + // Compute a global operation ID numbering according to the pre-order walk of + // the IR. This is used as reference to construct use-list orders. + unsigned operationID = 0; + op->walk( + [&](Operation *op) { operationIDs.try_emplace(op, operationID++); }); + // Number the root operation. 
number(*op); diff --git a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp --- a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp +++ b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp @@ -121,7 +121,7 @@ void ConvertToLLVMPattern::getMemRefDescriptorSizes( Location loc, MemRefType memRefType, ValueRange dynamicSizes, ConversionPatternRewriter &rewriter, SmallVectorImpl &sizes, - SmallVectorImpl &strides, Value &sizeBytes) const { + SmallVectorImpl &strides, Value &size, bool sizeInBytes) const { assert(isConvertibleAndHasIdentityMaps(memRefType) && "layout maps must have been normalized away"); assert(count(memRefType.getShape(), ShapedType::kDynamic) == @@ -143,14 +143,14 @@ for (auto i = memRefType.getRank(); i-- > 0;) { strides[i] = runningStride; - int64_t size = memRefType.getShape()[i]; - if (size == 0) + int64_t staticSize = memRefType.getShape()[i]; + if (staticSize == 0) continue; bool useSizeAsStride = stride == 1; - if (size == ShapedType::kDynamic) + if (staticSize == ShapedType::kDynamic) stride = ShapedType::kDynamic; if (stride != ShapedType::kDynamic) - stride *= size; + stride *= staticSize; if (useSizeAsStride) runningStride = sizes[i]; @@ -160,14 +160,17 @@ else runningStride = createIndexConstant(rewriter, loc, stride); } - - // Buffer size in bytes. - Type elementType = typeConverter->convertType(memRefType.getElementType()); - Type elementPtrType = getTypeConverter()->getPointerType(elementType); - Value nullPtr = rewriter.create(loc, elementPtrType); - Value gepPtr = rewriter.create(loc, elementPtrType, elementType, - nullPtr, runningStride); - sizeBytes = rewriter.create(loc, getIndexType(), gepPtr); + if (sizeInBytes) { + // Buffer size in bytes. 
+ Type elementType = typeConverter->convertType(memRefType.getElementType()); + Type elementPtrType = getTypeConverter()->getPointerType(elementType); + Value nullPtr = rewriter.create(loc, elementPtrType); + Value gepPtr = rewriter.create( + loc, elementPtrType, elementType, nullPtr, runningStride); + size = rewriter.create(loc, getIndexType(), gepPtr); + } else { + size = runningStride; + } } Value ConvertToLLVMPattern::getSizeInBytes( @@ -186,13 +189,30 @@ } Value ConvertToLLVMPattern::getNumElements( - Location loc, ArrayRef shape, + Location loc, MemRefType memRefType, ValueRange dynamicSizes, ConversionPatternRewriter &rewriter) const { + assert(count(memRefType.getShape(), ShapedType::kDynamic) == + static_cast(dynamicSizes.size()) && + "dynamicSizes size doesn't match dynamic sizes count in memref shape"); + + Value numElements = memRefType.getRank() == 0 + ? createIndexConstant(rewriter, loc, 1) + : nullptr; + unsigned dynamicIndex = 0; + // Compute the total number of memref elements. - Value numElements = - shape.empty() ? createIndexConstant(rewriter, loc, 1) : shape.front(); - for (unsigned i = 1, e = shape.size(); i < e; ++i) - numElements = rewriter.create(loc, numElements, shape[i]); + for (int64_t staticSize : memRefType.getShape()) { + if (numElements) { + Value size = staticSize == ShapedType::kDynamic + ? dynamicSizes[dynamicIndex++] + : createIndexConstant(rewriter, loc, staticSize); + numElements = rewriter.create(loc, numElements, size); + } else { + numElements = staticSize == ShapedType::kDynamic + ? 
dynamicSizes[dynamicIndex++] + : createIndexConstant(rewriter, loc, staticSize); + } + } return numElements; } diff --git a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp --- a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp +++ b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp @@ -156,6 +156,10 @@ elementPtrType, *getTypeConverter()); } +void AllocLikeOpLLVMLowering::setRequiresNumElements() { + requiresNumElements = true; +} + LogicalResult AllocLikeOpLLVMLowering::matchAndRewrite( Operation *op, ArrayRef operands, ConversionPatternRewriter &rewriter) const { @@ -169,13 +173,14 @@ // zero-dimensional memref, assume a scalar (size 1). SmallVector sizes; SmallVector strides; - Value sizeBytes; + Value size; + this->getMemRefDescriptorSizes(loc, memRefType, operands, rewriter, sizes, - strides, sizeBytes); + strides, size, !requiresNumElements); // Allocate the underlying buffer. auto [allocatedPtr, alignedPtr] = - this->allocateBuffer(rewriter, loc, sizeBytes, op); + this->allocateBuffer(rewriter, loc, size, op); // Create the MemRef descriptor. auto memRefDescriptor = this->createMemRefDescriptor( diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp --- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp +++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp @@ -85,13 +85,15 @@ struct AllocaOpLowering : public AllocLikeOpLLVMLowering { AllocaOpLowering(LLVMTypeConverter &converter) : AllocLikeOpLLVMLowering(memref::AllocaOp::getOperationName(), - converter) {} + converter) { + setRequiresNumElements(); + } /// Allocates the underlying buffer using the right call. `allocatedBytePtr` /// is set to null for stack allocations. `accessAlignment` is set if /// alignment is needed post allocation (for eg. in conjunction with malloc). 
std::tuple allocateBuffer(ConversionPatternRewriter &rewriter, - Location loc, Value sizeBytes, + Location loc, Value size, Operation *op) const override { // With alloca, one gets a pointer to the element type right away. @@ -104,9 +106,9 @@ auto elementPtrType = getTypeConverter()->getPointerType(elementType, addrSpace); - auto allocatedElementPtr = rewriter.create( - loc, elementPtrType, elementType, sizeBytes, - allocaOp.getAlignment().value_or(0)); + auto allocatedElementPtr = + rewriter.create(loc, elementPtrType, elementType, size, + allocaOp.getAlignment().value_or(0)); return std::make_tuple(allocatedElementPtr, allocatedElementPtr); } diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp @@ -245,16 +245,6 @@ targetAlignment); } -/// Returns true if the given argument or result attribute is supported by the -/// inliner, false otherwise. -static bool isArgOrResAttrSupported(NamedAttribute attr) { - if (attr.getName() == LLVM::LLVMDialect::getInAllocaAttrName()) - return false; - if (attr.getName() == LLVM::LLVMDialect::getNoAliasAttrName()) - return false; - return true; -} - namespace { struct LLVMInlinerInterface : public DialectInlinerInterface { using DialectInlinerInterface::DialectInlinerInterface; @@ -287,27 +277,13 @@ << "Cannot inline: callable is not an LLVM::LLVMFuncOp\n"); return false; } + // TODO: Generate aliasing metadata from noalias argument/result attributes. 
if (auto attrs = funcOp.getArgAttrs()) { for (DictionaryAttr attrDict : attrs->getAsRange()) { - for (NamedAttribute attr : attrDict) { - if (!isArgOrResAttrSupported(attr)) { - LLVM_DEBUG(llvm::dbgs() << "Cannot inline " << funcOp.getSymName() - << ": unhandled argument attribute " - << attr.getName() << "\n"); - return false; - } - } - } - } - if (auto attrs = funcOp.getResAttrs()) { - for (DictionaryAttr attrDict : attrs->getAsRange()) { - for (NamedAttribute attr : attrDict) { - if (!isArgOrResAttrSupported(attr)) { - LLVM_DEBUG(llvm::dbgs() << "Cannot inline " << funcOp.getSymName() - << ": unhandled return attribute " - << attr.getName() << "\n"); - return false; - } + if (attrDict.contains(LLVM::LLVMDialect::getInAllocaAttrName())) { + LLVM_DEBUG(llvm::dbgs() << "Cannot inline " << funcOp.getSymName() + << ": inalloca arguments not supported\n"); + return false; } } } @@ -364,6 +340,7 @@ } // clang-format off if (isa LLVM::AllocaOp::getDestructurableSlots() { + if (!mlir::matchPattern(getArraySize(), m_One())) + return {}; + + Type elemType = getAllocaElementType(*this); + auto destructurable = dyn_cast(elemType); + if (!destructurable) + return {}; + + std::optional> destructuredType = + destructurable.getSubelementIndexMap(); + if (!destructuredType) + return {}; + + DenseMap allocaTypeMap; + for (Attribute index : llvm::make_first_range(destructuredType.value())) + allocaTypeMap.insert({index, LLVM::LLVMPointerType::get(getContext())}); + + return {DestructurableMemorySlot{{getResult(), elemType}, {allocaTypeMap}}}; +} + +DenseMap +LLVM::AllocaOp::destructure(const DestructurableMemorySlot &slot, + const SmallPtrSetImpl &usedIndices, + RewriterBase &rewriter) { + assert(slot.ptr == getResult()); + Type elemType = + getElemType() ? 
*getElemType() : getResult().getType().getElementType(); + + rewriter.setInsertionPointAfter(*this); + + auto destructurableType = cast(elemType); + DenseMap slotMap; + for (Attribute index : usedIndices) { + Type elemType = destructurableType.getTypeAtIndex(index); + assert(elemType && "used index must exist"); + auto subAlloca = rewriter.create( + getLoc(), LLVM::LLVMPointerType::get(getContext()), elemType, + getArraySize()); + slotMap.try_emplace(index, {subAlloca.getResult(), elemType}); + } + + return slotMap; +} + +void LLVM::AllocaOp::handleDestructuringComplete( + const DestructurableMemorySlot &slot, RewriterBase &rewriter) { + assert(slot.ptr == getResult()); + rewriter.eraseOp(*this); +} + //===----------------------------------------------------------------------===// // Interfaces for LoadOp/StoreOp //===----------------------------------------------------------------------===// @@ -115,12 +170,23 @@ // `canUsesBeRemoved` checked this blocking use must be the stored slot // pointer. 
for (Operation *user : slot.ptr.getUsers()) - if (auto declareOp = llvm::dyn_cast(user)) + if (auto declareOp = dyn_cast(user)) rewriter.create(declareOp->getLoc(), getValue(), declareOp.getVarInfo()); return DeletionKind::Delete; } +LogicalResult LLVM::LoadOp::ensureOnlySafeAccesses( + const MemorySlot &slot, SmallVectorImpl &mustBeSafelyUsed) { + return success(getAddr() != slot.ptr || getType() == slot.elemType); +} + +LogicalResult LLVM::StoreOp::ensureOnlySafeAccesses( + const MemorySlot &slot, SmallVectorImpl &mustBeSafelyUsed) { + return success(getAddr() != slot.ptr || + getValue().getType() == slot.elemType); +} + //===----------------------------------------------------------------------===// // Interfaces for discardable OPs //===----------------------------------------------------------------------===// @@ -189,6 +255,10 @@ return DeletionKind::Delete; } +//===----------------------------------------------------------------------===// +// Interfaces for GEPOp +//===----------------------------------------------------------------------===// + static bool hasAllZeroIndices(LLVM::GEPOp gepOp) { return llvm::all_of(gepOp.getIndices(), [](auto index) { auto indexAttr = index.template dyn_cast(); @@ -196,10 +266,6 @@ }); } -//===----------------------------------------------------------------------===// -// Interfaces for GEPOp -//===----------------------------------------------------------------------===// - bool LLVM::GEPOp::canUsesBeRemoved( const SmallPtrSetImpl &blockingUses, SmallVectorImpl &newBlockingUses) { @@ -213,3 +279,171 @@ const SmallPtrSetImpl &blockingUses, RewriterBase &rewriter) { return DeletionKind::Delete; } + +/// Returns the type the resulting pointer of the GEP points to. If such a type +/// is not clear, returns null type. +static Type computeReachedGEPType(LLVM::GEPOp gep) { + if (gep.getIndices().empty()) + return {}; + + // Ensures all indices are static and fetches them. 
+ SmallVector indices; + for (auto index : gep.getIndices()) { + IntegerAttr indexInt = index.dyn_cast(); + if (!indexInt) + return {}; + indices.push_back(indexInt); + } + + // Check the pointer indexing only targets the first element. + if (indices[0].getInt() != 0) + return {}; + + // Set the initial type currently being used for indexing. This will be + // updated as the indices get walked over. + std::optional maybeSelectedType = gep.getElemType(); + if (!maybeSelectedType) + return {}; + Type selectedType = *maybeSelectedType; + + // Follow the indexed elements in the gep. + for (IntegerAttr index : llvm::drop_begin(indices)) { + // Ensure the structure of the type being indexed can be reasoned about. + // This includes rejecting any potential typed pointer. + auto destructurable = selectedType.dyn_cast(); + if (!destructurable) + return {}; + + // Follow the type at the index the gep is accessing, making it the new type + // used for indexing. + Type field = destructurable.getTypeAtIndex(index); + if (!field) + return {}; + selectedType = field; + } + + // When there are no more indices, the type currently being used for indexing + // is the type of the value pointed at by the returned indexed pointer. + return selectedType; +} + +LogicalResult LLVM::GEPOp::ensureOnlySafeAccesses( + const MemorySlot &slot, SmallVectorImpl &mustBeSafelyUsed) { + if (getBase() != slot.ptr) + return success(); + if (slot.elemType != getElemType()) + return failure(); + Type reachedType = computeReachedGEPType(*this); + if (!reachedType) + return failure(); + mustBeSafelyUsed.emplace_back({getResult(), reachedType}); + return success(); +} + +bool LLVM::GEPOp::canRewire(const DestructurableMemorySlot &slot, + SmallPtrSetImpl &usedIndices, + SmallVectorImpl &mustBeSafelyUsed) { + auto basePtrType = getBase().getType().dyn_cast(); + if (!basePtrType) + return false; + + // Typed pointers are not supported. 
This should be removed once typed + // pointers are removed from the LLVM dialect. + if (!basePtrType.isOpaque()) + return false; + + if (getBase() != slot.ptr || slot.elemType != getElemType()) + return false; + Type reachedType = computeReachedGEPType(*this); + if (!reachedType || getIndices().size() < 2) + return false; + auto firstLevelIndex = cast(getIndices()[1]); + assert(slot.elementPtrs.contains(firstLevelIndex)); + if (!slot.elementPtrs.at(firstLevelIndex).isa()) + return false; + mustBeSafelyUsed.emplace_back({getResult(), reachedType}); + usedIndices.insert(firstLevelIndex); + return true; +} + +DeletionKind LLVM::GEPOp::rewire(const DestructurableMemorySlot &slot, + DenseMap &subslots, + RewriterBase &rewriter) { + IntegerAttr firstLevelIndex = getIndices()[1].dyn_cast(); + const MemorySlot &newSlot = subslots.at(firstLevelIndex); + + ArrayRef remainingIndices = getRawConstantIndices().slice(2); + + // If the GEP would become trivial after this transformation, eliminate it. + // A GEP should only be eliminated if it has no indices (except the first + // pointer index), as simplifying GEPs with all-zero indices would eliminate + // structure information useful for further destruction. + if (remainingIndices.empty()) { + rewriter.replaceAllUsesWith(getResult(), newSlot.ptr); + return DeletionKind::Delete; + } + + rewriter.updateRootInPlace(*this, [&]() { + // Rewire the indices by popping off the second index. + // Start with a single zero, then add the indices beyond the second. + SmallVector newIndices(1); + newIndices.append(remainingIndices.begin(), remainingIndices.end()); + setRawConstantIndices(newIndices); + + // Rewire the pointed type. + setElemType(newSlot.elemType); + + // Rewire the pointer. 
+ getBaseMutable().assign(newSlot.ptr); + }); + + return DeletionKind::Keep; +} + +//===----------------------------------------------------------------------===// +// Interfaces for destructurable types +//===----------------------------------------------------------------------===// + +std::optional> +LLVM::LLVMStructType::getSubelementIndexMap() { + Type i32 = IntegerType::get(getContext(), 32); + DenseMap destructured; + for (const auto &[index, elemType] : llvm::enumerate(getBody())) + destructured.insert({IntegerAttr::get(i32, index), elemType}); + return destructured; +} + +Type LLVM::LLVMStructType::getTypeAtIndex(Attribute index) { + auto indexAttr = index.dyn_cast(); + if (!indexAttr || !indexAttr.getType().isInteger(32)) + return {}; + int32_t indexInt = indexAttr.getInt(); + ArrayRef body = getBody(); + if (indexInt < 0 || body.size() <= static_cast(indexInt)) + return {}; + return body[indexInt]; +} + +std::optional> +LLVM::LLVMArrayType::getSubelementIndexMap() const { + constexpr size_t maxArraySizeForDestructuring = 16; + if (getNumElements() > maxArraySizeForDestructuring) + return {}; + int32_t numElements = getNumElements(); + + Type i32 = IntegerType::get(getContext(), 32); + DenseMap destructured; + for (int32_t index = 0; index < numElements; ++index) + destructured.insert({IntegerAttr::get(i32, index), getElementType()}); + return destructured; +} + +Type LLVM::LLVMArrayType::getTypeAtIndex(Attribute index) const { + auto indexAttr = index.dyn_cast(); + if (!indexAttr || !indexAttr.getType().isInteger(32)) + return {}; + int32_t indexInt = indexAttr.getInt(); + if (indexInt < 0 || getNumElements() <= static_cast(indexInt)) + return {}; + return getElementType(); +} diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -436,6 +436,43 @@ return success(); } 
+//===----------------------------------------------------------------------===// +// Custom parser and printer verifier for private clause +//===----------------------------------------------------------------------===// + +static ParseResult parsePrivatizationList( + mlir::OpAsmParser &parser, + llvm::SmallVectorImpl &operands, + llvm::SmallVectorImpl &types, mlir::ArrayAttr &privatizationSymbols) { + llvm::SmallVector privatizationVec; + if (failed(parser.parseCommaSeparatedList([&]() { + if (parser.parseAttribute(privatizationVec.emplace_back()) || + parser.parseArrow() || + parser.parseOperand(operands.emplace_back()) || + parser.parseColonType(types.emplace_back())) + return failure(); + return success(); + }))) + return failure(); + llvm::SmallVector privatizations(privatizationVec.begin(), + privatizationVec.end()); + privatizationSymbols = ArrayAttr::get(parser.getContext(), privatizations); + return success(); +} + +static void +printPrivatizationList(mlir::OpAsmPrinter &p, mlir::Operation *op, + mlir::OperandRange privateOperands, + mlir::TypeRange privateTypes, + std::optional privatizations) { + for (unsigned i = 0, e = privatizations->size(); i < e; ++i) { + if (i != 0) + p << ", "; + p << (*privatizations)[i] << " -> " << privateOperands[i] << " : " + << privateOperands[i].getType(); + } +} + //===----------------------------------------------------------------------===// // ParallelOp //===----------------------------------------------------------------------===// @@ -455,6 +492,45 @@ return success(); } +static LogicalResult +checkPrivatizationList(Operation *op, + std::optional privatizations, + mlir::OperandRange privateOperands) { + if (!privateOperands.empty()) { + if (!privatizations || privatizations->size() != privateOperands.size()) + return op->emitOpError() << "expected as many privatizations symbol " + "reference as private operands"; + } else { + if (privatizations) + return op->emitOpError() << "unexpected privatizations symbol 
reference"; + return success(); + } + + llvm::DenseSet privates; + for (auto args : llvm::zip(privateOperands, *privatizations)) { + mlir::Value privateOperand = std::get<0>(args); + + if (!privates.insert(privateOperand).second) + return op->emitOpError() << "private operand appears more than once"; + + mlir::Type varType = privateOperand.getType(); + auto symbolRef = std::get<1>(args).cast(); + auto decl = + SymbolTable::lookupNearestSymbolFrom(op, symbolRef); + if (!decl) + return op->emitOpError() << "expected symbol reference " << symbolRef + << " to point to a private declaration"; + + if (decl.getType() && decl.getType() != varType) + return op->emitOpError() + << "expected private (" << varType + << ") to be the same type as private declaration (" + << decl.getType() << ")"; + } + + return success(); +} + unsigned ParallelOp::getNumDataOperands() { return getReductionOperands().size() + getGangPrivateOperands().size() + getGangFirstPrivateOperands().size() + getDataClauseOperands().size(); @@ -471,6 +547,9 @@ } LogicalResult acc::ParallelOp::verify() { + if (failed(checkPrivatizationList(*this, getPrivatizations(), + getGangPrivateOperands()))) + return failure(); return checkDataOperands(*this, getDataClauseOperands()); } @@ -647,6 +726,10 @@ if (getSeq() && (getHasGang() || getHasWorker() || getHasVector())) return emitError("gang, worker or vector cannot appear with the seq attr"); + if (failed(checkPrivatizationList(*this, getPrivatizations(), + getPrivateOperands()))) + return failure(); + // Check non-empty body(). 
if (getRegion().empty()) return emitError("expected non-empty body."); diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt --- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt @@ -12,4 +12,5 @@ LINK_LIBS PUBLIC MLIRIR MLIRLLVMDialect + MLIRFuncDialect ) diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/DialectImplementation.h" @@ -71,8 +72,23 @@ MemRefType::attachInterface>(*getContext()); LLVM::LLVMPointerType::attachInterface< PointerLikeModel>(*getContext()); + + // Attach default offload module interface to module op to access + // offload functionality through mlir::ModuleOp::attachInterface( *getContext()); + + // Attach default declare target interfaces to operations which can be marked + // as declare target (Global Operations and Functions/Subroutines in dialects + // that Fortran (or other languages that lower to MLIR) translates too + mlir::LLVM::GlobalOp::attachInterface< + mlir::omp::DeclareTargetDefaultModel>( + *getContext()); + mlir::LLVM::LLVMFuncOp::attachInterface< + mlir::omp::DeclareTargetDefaultModel>( + *getContext()); + mlir::func::FuncOp::attachInterface< + mlir::omp::DeclareTargetDefaultModel>(*getContext()); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp --- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp +++ 
b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1087,6 +1087,54 @@ } }; +/// Bufferization of tensor.splat. Bufferizes to a new allocation that is filled +/// with a linalg.map. Similar to tensor.generate. +struct SplatOpInterface + : public BufferizableOpInterface::ExternalModel { + + bool bufferizesToAllocation(Operation *op, OpResult opResult) const { + return true; + } + + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, + const BufferizationOptions &options) const { + OpBuilder::InsertionGuard g(rewriter); + auto splatOp = cast(op); + + // Should the buffer be deallocated? + bool dealloc = + shouldDeallocateOpResult(cast(splatOp.getResult()), options); + + // TODO: Implement memory space for this op. + if (options.defaultMemorySpace != Attribute()) + return op->emitError("memory space not implemented yet"); + + // Allocate memory. + Location loc = op->getLoc(); + FailureOr tensorAlloc = + allocateTensorForShapedValue(rewriter, loc, splatOp.getResult(), + /*escape=*/!dealloc, options, + /*copy=*/false); + if (failed(tensorAlloc)) + return failure(); + + // Create linalg::MapOp. + auto tensorType = cast(tensorAlloc->getType()); + auto linalgOp = + rewriter.create(loc, tensorType, /*inputs=*/ValueRange(), + /*init=*/*tensorAlloc); + Block &linalgBody = linalgOp.getMapper().emplaceBlock(); + + // Create linalg::IndexOps. + rewriter.setInsertionPointToStart(&linalgBody); + rewriter.create(loc, splatOp.getInput()); + rewriter.replaceOp(splatOp, linalgOp.getResult()[0]); + + return success(); + } +}; + } // namespace } // namespace tensor } // namespace mlir @@ -1110,6 +1158,7 @@ *ctx); RankOp::attachInterface(*ctx); ReshapeOp::attachInterface(*ctx); + SplatOp::attachInterface(*ctx); // Load additional dialects of which ops may get created. 
ctx->loadDialect(); diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -1027,3 +1027,13 @@ return {}; } + +OpFoldResult tosa::AbsOp::fold(FoldAdaptor adaptor) { + auto input = getInput1(); + // Element-wise abs(abs(x)) = abs(x) + if (auto op = input.getDefiningOp()) { + return input; + } + + return {}; +} diff --git a/mlir/lib/IR/Value.cpp b/mlir/lib/IR/Value.cpp --- a/mlir/lib/IR/Value.cpp +++ b/mlir/lib/IR/Value.cpp @@ -93,6 +93,11 @@ }); } +/// Shuffles the use-list order according to the provided indices. +void Value::shuffleUseList(ArrayRef indices) { + getImpl()->shuffleUseList(indices); +} + //===----------------------------------------------------------------------===// // OpResult //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Interfaces/MemorySlotInterfaces.cpp b/mlir/lib/Interfaces/MemorySlotInterfaces.cpp --- a/mlir/lib/Interfaces/MemorySlotInterfaces.cpp +++ b/mlir/lib/Interfaces/MemorySlotInterfaces.cpp @@ -9,3 +9,4 @@ #include "mlir/Interfaces/MemorySlotInterfaces.h" #include "mlir/Interfaces/MemorySlotOpInterfaces.cpp.inc" +#include "mlir/Interfaces/MemorySlotTypeInterfaces.cpp.inc" diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -694,6 +694,11 @@ return taskOp.emitError("unhandled clauses for translation to LLVM IR"); } auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { + // Save the alloca insertion point on ModuleTranslation stack for use in + // nested regions. 
+ LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocaIP); + builder.restoreIP(codegenIP); convertOmpOpRegions(taskOp.getRegion(), "omp.task.region", builder, moduleTranslation, bodyGenStatus); diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -12,6 +12,7 @@ OpStats.cpp PrintIR.cpp SCCP.cpp + SROA.cpp StripDebugInfo.cpp SymbolDCE.cpp SymbolPrivatize.cpp diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp --- a/mlir/lib/Transforms/Mem2Reg.cpp +++ b/mlir/lib/Transforms/Mem2Reg.cpp @@ -578,8 +578,6 @@ LogicalResult mlir::tryToPromoteMemorySlots( ArrayRef allocators, RewriterBase &rewriter, Mem2RegStatistics statistics) { - DominanceInfo dominance; - bool promotedAny = false; for (PromotableAllocationOpInterface allocator : allocators) { diff --git a/mlir/lib/Transforms/SROA.cpp b/mlir/lib/Transforms/SROA.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Transforms/SROA.cpp @@ -0,0 +1,235 @@ +//===-- SROA.cpp - Scalar Replacement Of Aggregates -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Transforms/SROA.h" +#include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/Interfaces/MemorySlotInterfaces.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/Passes.h" + +namespace mlir { +#define GEN_PASS_DEF_SROA +#include "mlir/Transforms/Passes.h.inc" +} // namespace mlir + +#define DEBUG_TYPE "sroa" + +using namespace mlir; + +namespace { + +/// Information computed by destructurable memory slot analysis used to perform +/// actual destructuring of the slot. 
This struct is only constructed if +/// destructuring is possible, and contains the necessary data to perform it. +struct MemorySlotDestructuringInfo { + /// Set of the indices that are actually used when accessing the subelements. + SmallPtrSet usedIndices; + /// Blocking uses of a given user of the memory slot that must be eliminated. + DenseMap> userToBlockingUses; + /// List of potentially indirect accessors of the memory slot that need + /// rewiring. + SmallVector accessors; +}; + +} // namespace + +/// Computes information for slot destructuring. This will compute whether this +/// slot can be destructured and data to perform the destructuring. Returns +/// nothing if the slot cannot be destructured or if there is no useful work to +/// be done. +static std::optional +computeDestructuringInfo(DestructurableMemorySlot &slot) { + assert(isa(slot.elemType)); + + if (slot.ptr.use_empty()) + return {}; + + MemorySlotDestructuringInfo info; + + SmallVector usedSafelyWorklist; + + auto scheduleAsBlockingUse = [&](OpOperand &use) { + SmallPtrSetImpl &blockingUses = + info.userToBlockingUses.getOrInsertDefault(use.getOwner()); + blockingUses.insert(&use); + }; + + // Initialize the analysis with the immediate users of the slot. + for (OpOperand &use : slot.ptr.getUses()) { + if (auto accessor = + dyn_cast(use.getOwner())) { + if (accessor.canRewire(slot, info.usedIndices, usedSafelyWorklist)) { + info.accessors.push_back(accessor); + continue; + } + } + + // If it cannot be shown that the operation uses the slot safely, maybe it + // can be promoted out of using the slot? 
+ scheduleAsBlockingUse(use); + } + + SmallPtrSet visited; + while (!usedSafelyWorklist.empty()) { + MemorySlot mustBeUsedSafely = usedSafelyWorklist.pop_back_val(); + for (OpOperand &subslotUse : mustBeUsedSafely.ptr.getUses()) { + if (!visited.insert(&subslotUse).second) + continue; + Operation *subslotUser = subslotUse.getOwner(); + + if (auto memOp = dyn_cast(subslotUser)) + if (succeeded(memOp.ensureOnlySafeAccesses(mustBeUsedSafely, + usedSafelyWorklist))) + continue; + + // If it cannot be shown that the operation uses the slot safely, maybe it + // can be promoted out of using the slot? + scheduleAsBlockingUse(subslotUse); + } + } + + SetVector forwardSlice; + mlir::getForwardSlice(slot.ptr, &forwardSlice); + for (Operation *user : forwardSlice) { + // If the next operation has no blocking uses, everything is fine. + if (!info.userToBlockingUses.contains(user)) + continue; + + SmallPtrSet &blockingUses = info.userToBlockingUses[user]; + auto promotable = dyn_cast(user); + + // An operation that has blocking uses must be promoted. If it is not + // promotable, destructuring must fail. + if (!promotable) + return {}; + + SmallVector newBlockingUses; + // If the operation decides it cannot deal with removing the blocking uses, + // destructuring must fail. + if (!promotable.canUsesBeRemoved(blockingUses, newBlockingUses)) + return {}; + + // Then, register any new blocking uses for coming operations. + for (OpOperand *blockingUse : newBlockingUses) { + assert(llvm::is_contained(user->getResults(), blockingUse->get())); + + SmallPtrSetImpl &newUserBlockingUseSet = + info.userToBlockingUses.getOrInsertDefault(blockingUse->getOwner()); + newUserBlockingUseSet.insert(blockingUse); + } + } + + return info; +} + +/// Performs the destructuring of a destructible slot given associated +/// destructuring information. The provided slot will be destructured in +/// subslots as specified by its allocator. 
+static void destructureSlot(DestructurableMemorySlot &slot, + DestructurableAllocationOpInterface allocator, + RewriterBase &rewriter, + MemorySlotDestructuringInfo &info, + const SROAStatistics &statistics) { + RewriterBase::InsertionGuard guard(rewriter); + + rewriter.setInsertionPointToStart(slot.ptr.getParentBlock()); + DenseMap subslots = + allocator.destructure(slot, info.usedIndices, rewriter); + + if (statistics.slotsWithMemoryBenefit && + slot.elementPtrs.size() != info.usedIndices.size()) + (*statistics.slotsWithMemoryBenefit)++; + + if (statistics.maxSubelementAmount) + statistics.maxSubelementAmount->updateMax(slot.elementPtrs.size()); + + SetVector usersToRewire; + for (Operation *user : llvm::make_first_range(info.userToBlockingUses)) + usersToRewire.insert(user); + for (DestructurableAccessorOpInterface accessor : info.accessors) + usersToRewire.insert(accessor); + usersToRewire = mlir::topologicalSort(usersToRewire); + + llvm::SmallVector toErase; + for (Operation *toRewire : llvm::reverse(usersToRewire)) { + rewriter.setInsertionPointAfter(toRewire); + if (auto accessor = dyn_cast(toRewire)) { + if (accessor.rewire(slot, subslots, rewriter) == DeletionKind::Delete) + toErase.push_back(accessor); + continue; + } + + auto promotable = cast(toRewire); + if (promotable.removeBlockingUses(info.userToBlockingUses[promotable], + rewriter) == DeletionKind::Delete) + toErase.push_back(promotable); + } + + for (Operation *toEraseOp : toErase) + rewriter.eraseOp(toEraseOp); + + assert(slot.ptr.use_empty() && "after destructuring, the original slot " + "pointer should no longer be used"); + + LLVM_DEBUG(llvm::dbgs() << "[sroa] Destructured memory slot: " << slot.ptr + << "\n"); + + if (statistics.destructuredAmount) + (*statistics.destructuredAmount)++; + + allocator.handleDestructuringComplete(slot, rewriter); +} + +LogicalResult mlir::tryToDestructureMemorySlots( + ArrayRef allocators, + RewriterBase &rewriter, SROAStatistics statistics) { + bool 
destructuredAny = false; + + for (DestructurableAllocationOpInterface allocator : allocators) { + for (DestructurableMemorySlot slot : allocator.getDestructurableSlots()) { + std::optional info = + computeDestructuringInfo(slot); + if (!info) + continue; + + destructureSlot(slot, allocator, rewriter, *info, statistics); + destructuredAny = true; + } + } + + return success(destructuredAny); +} + +LogicalResult +SROAPattern::matchAndRewrite(DestructurableAllocationOpInterface allocator, + PatternRewriter &rewriter) const { + hasBoundedRewriteRecursion(); + return tryToDestructureMemorySlots({allocator}, rewriter, statistics); +} + +namespace { + +struct SROA : public impl::SROABase { + using impl::SROABase::SROABase; + + void runOnOperation() override { + Operation *scopeOp = getOperation(); + + SROAStatistics statistics{&destructuredAmount, &slotsWithMemoryBenefit, + &maxSubelementAmount}; + + RewritePatternSet rewritePatterns(&getContext()); + rewritePatterns.add(&getContext(), statistics); + FrozenRewritePatternSet frozen(std::move(rewritePatterns)); + + if (failed(applyPatternsAndFoldGreedily(scopeOp, frozen))) + signalPassFailure(); + } +}; + +} // namespace diff --git a/mlir/python/mlir/ir.py b/mlir/python/mlir/ir.py --- a/mlir/python/mlir/ir.py +++ b/mlir/python/mlir/ir.py @@ -27,7 +27,7 @@ @register_attribute_builder("I16Attr") -def _i32Attr(x, context): +def _i16Attr(x, context): return IntegerAttr.get(IntegerType.get_signless(16, context=context), x) @@ -41,6 +41,26 @@ return IntegerAttr.get(IntegerType.get_signless(64, context=context), x) +@register_attribute_builder("SI16Attr") +def _si16Attr(x, context): + return IntegerAttr.get(IntegerType.get_signed(16, context=context), x) + + +@register_attribute_builder("SI32Attr") +def _si32Attr(x, context): + return IntegerAttr.get(IntegerType.get_signed(32, context=context), x) + + +@register_attribute_builder("F32Attr") +def _f32Attr(x, context): + return FloatAttr.get_f32(x, context=context) + + 
+@register_attribute_builder("F64Attr") +def _f64Attr(x, context): + return FloatAttr.get_f64(x, context=context) + + @register_attribute_builder("StrAttr") def _stringAttr(x, context): return StringAttr.get(x, context=context) @@ -61,11 +81,26 @@ return ArrayAttr.get(x, context=context) +@register_attribute_builder("I32ArrayAttr") +def _i32ArrayAttr(x, context): + return ArrayAttr.get([_i32Attr(v, context) for v in x]) + + @register_attribute_builder("I64ArrayAttr") def _i64ArrayAttr(x, context): return ArrayAttr.get([_i64Attr(v, context) for v in x]) +@register_attribute_builder("F32ArrayAttr") +def _f32ArrayAttr(x, context): + return ArrayAttr.get([_f32Attr(v, context) for v in x]) + + +@register_attribute_builder("F64ArrayAttr") +def _f64ArrayAttr(x, context): + return ArrayAttr.get([_f64Attr(v, context) for v in x]) + + @register_attribute_builder("DenseI64ArrayAttr") def _denseI64ArrayAttr(x, context): return DenseI64ArrayAttr.get(x, context=context) diff --git a/mlir/test/Bytecode/invalid/invalid-structure.mlir b/mlir/test/Bytecode/invalid/invalid-structure.mlir --- a/mlir/test/Bytecode/invalid/invalid-structure.mlir +++ b/mlir/test/Bytecode/invalid/invalid-structure.mlir @@ -9,7 +9,7 @@ //===--------------------------------------------------------------------===// // RUN: not mlir-opt %S/invalid-structure-version.mlirbc 2>&1 | FileCheck %s --check-prefix=VERSION -// VERSION: bytecode version 127 is newer than the current version 2 +// VERSION: bytecode version 127 is newer than the current version 3 //===--------------------------------------------------------------------===// // Producer diff --git a/mlir/test/Bytecode/uselist_orders.mlir b/mlir/test/Bytecode/uselist_orders.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Bytecode/uselist_orders.mlir @@ -0,0 +1,63 @@ +// RUN: mlir-opt %s -split-input-file --test-verify-uselistorder -verify-diagnostics + +// COM: --test-verify-uselistorder will randomly shuffle the uselist of every +// value and do a 
roundtrip to bytecode. An error is returned if the +// uselist order are not preserved when doing a roundtrip to bytecode. The +// test needs to verify diagnostics to be functional. + +func.func @base_test(%arg0 : i32) -> i32 { + %0 = arith.constant 45 : i32 + %1 = arith.constant 46 : i32 + %2 = "test.addi"(%arg0, %arg0) : (i32, i32) -> i32 + %3 = "test.addi"(%2, %0) : (i32, i32) -> i32 + %4 = "test.addi"(%2, %1) : (i32, i32) -> i32 + %5 = "test.addi"(%3, %4) : (i32, i32) -> i32 + %6 = "test.addi"(%5, %4) : (i32, i32) -> i32 + %7 = "test.addi"(%6, %4) : (i32, i32) -> i32 + return %7 : i32 +} + +// ----- + +func.func @test_with_multiple_uses_in_same_op(%arg0 : i32) -> i32 { + %0 = arith.constant 45 : i32 + %1 = arith.constant 46 : i32 + %2 = "test.addi"(%arg0, %arg0) : (i32, i32) -> i32 + %3 = "test.addi"(%2, %0) : (i32, i32) -> i32 + %4 = "test.addi"(%2, %1) : (i32, i32) -> i32 + %5 = "test.addi"(%2, %2) : (i32, i32) -> i32 + %6 = "test.addi"(%3, %4) : (i32, i32) -> i32 + %7 = "test.addi"(%6, %5) : (i32, i32) -> i32 + %8 = "test.addi"(%7, %4) : (i32, i32) -> i32 + %9 = "test.addi"(%8, %4) : (i32, i32) -> i32 + return %9 : i32 +} + +// ----- + +func.func @test_with_multiple_block_arg_uses(%arg0 : i32) -> i32 { + %0 = arith.constant 45 : i32 + %1 = arith.constant 46 : i32 + %2 = "test.addi"(%arg0, %arg0) : (i32, i32) -> i32 + %3 = "test.addi"(%2, %arg0) : (i32, i32) -> i32 + %4 = "test.addi"(%2, %1) : (i32, i32) -> i32 + %5 = "test.addi"(%2, %2) : (i32, i32) -> i32 + %6 = "test.addi"(%3, %4) : (i32, i32) -> i32 + %7 = "test.addi"(%6, %5) : (i32, i32) -> i32 + %8 = "test.addi"(%7, %4) : (i32, i32) -> i32 + %9 = "test.addi"(%8, %4) : (i32, i32) -> i32 + return %9 : i32 +} + +// ----- + +// Test that use-lists in region with no dominance are preserved +test.graph_region { + %0 = "test.foo"(%1) : (i32) -> i32 + test.graph_region attributes {a} { + %a = "test.a"(%b) : (i32) -> i32 + %b = "test.b"(%2) : (i32) -> i32 + } + %1 = "test.bar"(%2) : (i32) -> i32 + %2 = 
"test.baz"() : () -> i32 +} diff --git a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir --- a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir @@ -91,12 +91,15 @@ %j = arith.constant 16 : index gpu.subgroup_mma_store_matrix %arg0, %sg[%i,%j] {leadDimension= 32 : index, transpose} : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x32xf16, 3> // CHECK: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i64 - // CHECK: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] + // CHECK: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] + // CHECK: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] + // CHECK: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] + // CHECK: %[[MEMREF:.*]] = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] // CHECK: %[[EL1:.*]] = llvm.extractvalue %[[D]][0] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[EL2:.*]] = llvm.extractvalue %[[D]][1] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[EL3:.*]] = llvm.extractvalue %[[D]][2] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[EL4:.*]] = llvm.extractvalue %[[D]][3] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> - // CHECK: %[[BASE:.*]] = llvm.extractvalue %17[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[BASE:.*]] = llvm.extractvalue %[[MEMREF]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i64 // CHECK: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i64 // CHECK: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i64 @@ -107,12 +110,15 @@ // CHECK: llvm.return // CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32 - // CHECK32: %{{.*}} = llvm.insertvalue %{{.*}}, 
%{{.*}}[{{.*}}, {{.*}}] + // CHECK32: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] + // CHECK32: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] + // CHECK32: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] + // CHECK32: %[[MEMREF:.*]] = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] // CHECK32: %[[EL1:.*]] = llvm.extractvalue %[[D]][0] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK32: %[[EL2:.*]] = llvm.extractvalue %[[D]][1] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK32: %[[EL3:.*]] = llvm.extractvalue %[[D]][2] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK32: %[[EL4:.*]] = llvm.extractvalue %[[D]][3] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> - // CHECK32: %[[BASE:.*]] = llvm.extractvalue %17[1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<2 x i32>, array<2 x i32>)> + // CHECK32: %[[BASE:.*]] = llvm.extractvalue %[[MEMREF]][1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<2 x i32>, array<2 x i32>)> // CHECK32: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK32: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i32 // CHECK32: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i32 diff --git a/mlir/test/Conversion/MemRefToLLVM/convert-dynamic-memref-ops.mlir b/mlir/test/Conversion/MemRefToLLVM/convert-dynamic-memref-ops.mlir --- a/mlir/test/Conversion/MemRefToLLVM/convert-dynamic-memref-ops.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/convert-dynamic-memref-ops.mlir @@ -86,10 +86,7 @@ // CHECK-DAG: %[[N:.*]] = builtin.unrealized_conversion_cast %[[Narg]] // CHECK-NEXT: %[[st1:.*]] = llvm.mlir.constant(1 : index) : i64 // CHECK-NEXT: %[[num_elems:.*]] = llvm.mul %[[N]], %[[M]] : i64 -// CHECK-NEXT: %[[null:.*]] = llvm.mlir.null : !llvm.ptr -// CHECK-NEXT: %[[gep:.*]] = llvm.getelementptr %[[null]][%[[num_elems]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 -// CHECK-NEXT: 
%[[sz_bytes:.*]] = llvm.ptrtoint %[[gep]] : !llvm.ptr to i64 -// CHECK-NEXT: %[[allocated:.*]] = llvm.alloca %[[sz_bytes]] x f32 : (i64) -> !llvm.ptr +// CHECK-NEXT: %[[allocated:.*]] = llvm.alloca %[[num_elems]] x f32 : (i64) -> !llvm.ptr // CHECK-NEXT: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK-NEXT: llvm.insertvalue %[[allocated]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK-NEXT: llvm.insertvalue %[[allocated]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> diff --git a/mlir/test/Conversion/MemRefToLLVM/convert-static-memref-ops.mlir b/mlir/test/Conversion/MemRefToLLVM/convert-static-memref-ops.mlir --- a/mlir/test/Conversion/MemRefToLLVM/convert-static-memref-ops.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/convert-static-memref-ops.mlir @@ -79,10 +79,7 @@ // CHECK: %[[sz2:.*]] = llvm.mlir.constant(18 : index) : i64 // CHECK: %[[st2:.*]] = llvm.mlir.constant(1 : index) : i64 // CHECK: %[[num_elems:.*]] = llvm.mlir.constant(576 : index) : i64 -// CHECK: %[[null:.*]] = llvm.mlir.null : !llvm.ptr -// CHECK: %[[gep:.*]] = llvm.getelementptr %[[null]][%[[num_elems]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 -// CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]] : !llvm.ptr to i64 -// CHECK: %[[allocated:.*]] = llvm.alloca %[[size_bytes]] x f32 : (i64) -> !llvm.ptr +// CHECK: %[[allocated:.*]] = llvm.alloca %[[num_elems]] x f32 : (i64) -> !llvm.ptr %0 = memref.alloca() : memref<32x18xf32> // Test with explicitly specified alignment. 
llvm.alloca takes care of the diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir --- a/mlir/test/Dialect/LLVMIR/inlining.mlir +++ b/mlir/test/Dialect/LLVMIR/inlining.mlir @@ -12,15 +12,16 @@ llvm.intr.dbg.value #variable = %0 : i32 llvm.intr.dbg.declare #variableAddr = %ptr : !llvm.ptr %byte = llvm.mlir.constant(43 : i8) : i8 - %volatile = llvm.mlir.constant(1 : i1) : i1 - "llvm.intr.memset"(%ptr, %byte, %0, %volatile) : (!llvm.ptr, i8, i32, i1) -> () - "llvm.intr.memmove"(%ptr, %ptr, %0, %volatile) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () - "llvm.intr.memcpy"(%ptr, %ptr, %0, %volatile) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () + %true = llvm.mlir.constant(1 : i1) : i1 + "llvm.intr.memset"(%ptr, %byte, %0, %true) : (!llvm.ptr, i8, i32, i1) -> () + "llvm.intr.memmove"(%ptr, %ptr, %0, %true) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () + "llvm.intr.memcpy"(%ptr, %ptr, %0, %true) : (!llvm.ptr, !llvm.ptr, i32, i1) -> () + "llvm.intr.assume"(%true) : (i1) -> () llvm.fence release %2 = llvm.atomicrmw add %ptr, %0 monotonic : !llvm.ptr, i32 %3 = llvm.cmpxchg %ptr, %0, %1 acq_rel monotonic : !llvm.ptr, i32 llvm.inline_asm has_side_effects "foo", "bar" : () -> () - llvm.cond_br %volatile, ^bb1, ^bb2 + llvm.cond_br %true, ^bb1, ^bb2 ^bb1: llvm.unreachable ^bb2: @@ -39,6 +40,7 @@ // CHECK: "llvm.intr.memset"(%[[PTR]] // CHECK: "llvm.intr.memmove"(%[[PTR]], %[[PTR]] // CHECK: "llvm.intr.memcpy"(%[[PTR]], %[[PTR]] +// CHECK: "llvm.intr.assume" // CHECK: llvm.fence release // CHECK: llvm.atomicrmw add %[[PTR]], %[[CST]] monotonic // CHECK: llvm.cmpxchg %[[PTR]], %[[CST]], %[[RES]] acq_rel monotonic @@ -564,7 +566,7 @@ // ----- -llvm.func @ignored_attrs(%ptr : !llvm.ptr { llvm.inreg, llvm.nocapture, llvm.nofree, llvm.preallocated = i32, llvm.returned, llvm.alignstack = 32 : i64, llvm.writeonly, llvm.noundef, llvm.nonnull }, %x : i32 { llvm.zeroext }) -> (!llvm.ptr { llvm.noundef, llvm.inreg, llvm.nonnull }) { +llvm.func @ignored_attrs(%ptr : 
!llvm.ptr { llvm.inreg, llvm.noalias, llvm.nocapture, llvm.nofree, llvm.preallocated = i32, llvm.returned, llvm.alignstack = 32 : i64, llvm.writeonly, llvm.noundef, llvm.nonnull }, %x : i32 { llvm.zeroext }) -> (!llvm.ptr { llvm.noundef, llvm.inreg, llvm.nonnull }) { llvm.return %ptr : !llvm.ptr } @@ -578,7 +580,7 @@ // ----- -llvm.func @disallowed_arg_attr(%ptr : !llvm.ptr { llvm.noalias }) { +llvm.func @disallowed_arg_attr(%ptr : !llvm.ptr { llvm.inalloca = i64 }) { llvm.return } @@ -588,16 +590,3 @@ llvm.call @disallowed_arg_attr(%ptr) : (!llvm.ptr) -> () llvm.return } - -// ----- - -llvm.func @disallowed_res_attr(%ptr : !llvm.ptr) -> (!llvm.ptr { llvm.noalias }) { - llvm.return %ptr : !llvm.ptr -} - -// CHECK-LABEL: @test_disallow_res_attr -// CHECK-NEXT: llvm.call -llvm.func @test_disallow_res_attr(%ptr : !llvm.ptr) { - llvm.call @disallowed_res_attr(%ptr) : (!llvm.ptr) -> (!llvm.ptr) - llvm.return -} diff --git a/mlir/test/Dialect/LLVMIR/sroa-statistics.mlir b/mlir/test/Dialect/LLVMIR/sroa-statistics.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/sroa-statistics.mlir @@ -0,0 +1,61 @@ +// RUN: mlir-opt %s --pass-pipeline="builtin.module(llvm.func(sroa))" --split-input-file --mlir-pass-statistics 2>&1 >/dev/null | FileCheck %s + +// CHECK: SROA +// CHECK-NEXT: (S) 1 destructured slots +// CHECK-NEXT: (S) 2 max subelement number +// CHECK-NEXT: (S) 1 slots with memory benefit +llvm.func @basic() -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr inbounds %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32)> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK: SROA +// CHECK-NEXT: (S) 1 destructured slots +// CHECK-NEXT: (S) 2 max subelement number +// CHECK-NEXT: (S) 0 slots with memory benefit +llvm.func @basic_no_memory_benefit() -> i32 { + %0 = 
llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr inbounds %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32)> + %3 = llvm.getelementptr inbounds %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32)> + %4 = llvm.load %2 : !llvm.ptr -> i32 + %5 = llvm.load %3 : !llvm.ptr -> i32 + %6 = llvm.add %4, %5 : i32 + llvm.return %6 : i32 +} + +// ----- + +// CHECK: SROA +// CHECK-NEXT: (S) 1 destructured slots +// CHECK-NEXT: (S) 10 max subelement number +// CHECK-NEXT: (S) 1 slots with memory benefit +llvm.func @basic_array() -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.array<10 x i32> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr inbounds %1[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} + +// ----- + +// SROA is applied repeatedly here, peeling off layers of aggregates one after +// the other, four times. 
+ +// CHECK: SROA +// CHECK-NEXT: (S) 4 destructured slots +// CHECK-NEXT: (S) 10 max subelement number +// CHECK-NEXT: (S) 4 slots with memory benefit +llvm.func @multi_level_direct() -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f64, struct<"bar", (i8, array<10 x array<10 x i32>>, i8)>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr inbounds %1[0, 2, 1, 5, 8] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, struct<"bar", (i8, array<10 x array<10 x i32>>, i8)>)> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} diff --git a/mlir/test/Dialect/LLVMIR/sroa.mlir b/mlir/test/Dialect/LLVMIR/sroa.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/sroa.mlir @@ -0,0 +1,211 @@ +// RUN: mlir-opt %s --pass-pipeline="builtin.module(llvm.func(sroa))" --split-input-file | FileCheck %s + +// CHECK-LABEL: llvm.func @basic_struct +llvm.func @basic_struct() -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x i32 + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr inbounds %1[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, i32)> + // CHECK: %[[RES:.*]] = llvm.load %[[ALLOCA]] + %3 = llvm.load %2 : !llvm.ptr -> i32 + // CHECK: llvm.return %[[RES]] : i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @basic_array +llvm.func @basic_array() -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x i32 + %1 = llvm.alloca %0 x !llvm.array<10 x i32> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr inbounds %1[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32> + // CHECK: %[[RES:.*]] = llvm.load %[[ALLOCA]] + %3 = llvm.load %2 : 
!llvm.ptr -> i32 + // CHECK: llvm.return %[[RES]] : i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @multi_level_direct +llvm.func @multi_level_direct() -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x i32 + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f64, struct<"bar", (i8, array<10 x array<10 x i32>>, i8)>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr inbounds %1[0, 2, 1, 5, 8] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, struct<"bar", (i8, array<10 x array<10 x i32>>, i8)>)> + // CHECK: %[[RES:.*]] = llvm.load %[[ALLOCA]] + %3 = llvm.load %2 : !llvm.ptr -> i32 + // CHECK: llvm.return %[[RES]] : i32 + llvm.return %3 : i32 +} + +// ----- + +// The first application of SROA would generate a GEP with indices [0, 0]. This +// test ensures this GEP is not eliminated during the first application. Even +// though doing it would be correct, it would prevent the second application +// of SROA from eliminating the array. GEPs should be eliminated only when they are +// truly trivial (with indices [0]). 
+ +// CHECK-LABEL: llvm.func @multi_level_direct_two_applications +llvm.func @multi_level_direct_two_applications() -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x i32 + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f64, array<10 x i32>, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr inbounds %1[0, 2, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, array<10 x i32>, i8)> + // CHECK: %[[RES:.*]] = llvm.load %[[ALLOCA]] + %3 = llvm.load %2 : !llvm.ptr -> i32 + // CHECK: llvm.return %[[RES]] : i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @multi_level_indirect +llvm.func @multi_level_indirect() -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x i32 + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f64, struct<"bar", (i8, array<10 x array<10 x i32>>, i8)>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr inbounds %1[0, 2, 1, 5] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, struct<"bar", (i8, array<10 x array<10 x i32>>, i8)>)> + %3 = llvm.getelementptr inbounds %2[0, 8] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32> + // CHECK: %[[RES:.*]] = llvm.load %[[ALLOCA]] + %4 = llvm.load %3 : !llvm.ptr -> i32 + // CHECK: llvm.return %[[RES]] : i32 + llvm.return %4 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @resolve_alias +// CHECK-SAME: (%[[ARG:.*]]: i32) +llvm.func @resolve_alias(%arg: i32) -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x i32 + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr %1[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, i32)> + %3 
= llvm.getelementptr inbounds %1[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, i32)> + // CHECK: llvm.store %[[ARG]], %[[ALLOCA]] + llvm.store %arg, %2 : i32, !llvm.ptr + // CHECK: %[[RES:.*]] = llvm.load %[[ALLOCA]] + %4 = llvm.load %3 : !llvm.ptr -> i32 + // CHECK: llvm.return %[[RES]] : i32 + llvm.return %4 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @no_non_single_support +llvm.func @no_non_single_support() -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant + %0 = llvm.mlir.constant(2 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x !llvm.struct<"foo", (i32, f64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + // CHECK-NOT: = llvm.alloca + %2 = llvm.getelementptr inbounds %1[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, i32)> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @no_pointer_indexing +llvm.func @no_pointer_indexing() -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x !llvm.struct<"foo", (i32, f64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + // CHECK-NOT: = llvm.alloca + %2 = llvm.getelementptr %1[1, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, i32)> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @no_direct_use +llvm.func @no_direct_use() -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x !llvm.struct<"foo", (i32, f64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f64, i32)> 
{alignment = 8 : i64} : (i32) -> !llvm.ptr + // CHECK-NOT: = llvm.alloca + %2 = llvm.getelementptr %1[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, i32)> + %3 = llvm.load %2 : !llvm.ptr -> i32 + llvm.call @use(%1) : (!llvm.ptr) -> () + llvm.return %3 : i32 +} + +llvm.func @use(!llvm.ptr) + +// ----- + +// CHECK-LABEL: llvm.func @direct_promotable_use_is_fine +llvm.func @direct_promotable_use_is_fine() -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x i32 + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr %1[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, i32)> + // CHECK: %[[RES:.*]] = llvm.load %[[ALLOCA]] + %3 = llvm.load %2 : !llvm.ptr -> i32 + // This is a direct use of the slot but it can be removed because it implements PromotableOpInterface. + llvm.intr.lifetime.start 2, %1 : !llvm.ptr + // CHECK: llvm.return %[[RES]] : i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @direct_promotable_use_is_fine_on_accessor +llvm.func @direct_promotable_use_is_fine_on_accessor() -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x i32 + %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.getelementptr %1[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f64, i32)> + // CHECK: %[[RES:.*]] = llvm.load %[[ALLOCA]] + %3 = llvm.load %2 : !llvm.ptr -> i32 + // This does not provide side-effect info but it can be removed because it implements PromotableOpInterface. 
+ llvm.intr.lifetime.start 2, %2 : !llvm.ptr + // CHECK: llvm.return %[[RES]] : i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @no_dynamic_indexing +// CHECK-SAME: (%[[ARG:.*]]: i32) +llvm.func @no_dynamic_indexing(%arg: i32) -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x !llvm.array<10 x i32> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %1 = llvm.alloca %0 x !llvm.array<10 x i32> {alignment = 8 : i64} : (i32) -> !llvm.ptr + // CHECK-NOT: = llvm.alloca + // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, %[[ARG]]] + %2 = llvm.getelementptr %1[0, %arg] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32> + // CHECK: %[[RES:.*]] = llvm.load %[[GEP]] + %3 = llvm.load %2 : !llvm.ptr -> i32 + // CHECK: llvm.return %[[RES]] : i32 + llvm.return %3 : i32 +} + +// ----- + +// CHECK-LABEL: llvm.func @no_typed_pointers +llvm.func @no_typed_pointers() -> i32 { + // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x !llvm.array<10 x i32> {alignment = 8 : i64} : (i32) -> !llvm.ptr> + %1 = llvm.alloca %0 x !llvm.array<10 x i32> {alignment = 8 : i64} : (i32) -> !llvm.ptr> + // CHECK-NOT: = llvm.alloca + %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr>) -> !llvm.ptr + %3 = llvm.load %2 : !llvm.ptr + llvm.return %3 : i32 +} diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -114,6 +114,16 @@ // ----- +acc.private.recipe @privatization_memref_10_f32 : memref<10xf32> init { +^bb0(%arg0: memref<10xf32>): + %0 = memref.alloc() : memref<10xf32> + acc.yield %0 : memref<10xf32> +} destroy { +^bb0(%arg0: memref<10xf32>): + memref.dealloc %arg0 : memref<10xf32> + acc.terminator +} + func.func @compute3(%a: memref<10x10xf32>, %b: 
memref<10x10xf32>, %c: memref<10xf32>, %d: memref<10xf32>) -> memref<10xf32> { %lb = arith.constant 0 : index %st = arith.constant 1 : index @@ -126,7 +136,7 @@ %pc = acc.present varPtr(%c : memref<10xf32>) -> memref<10xf32> %pd = acc.present varPtr(%d : memref<10xf32>) -> memref<10xf32> acc.data dataOperands(%pa, %pb, %pc, %pd: memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) { - acc.parallel num_gangs(%numGangs: i64) num_workers(%numWorkers: i64) private(%c : memref<10xf32>) { + acc.parallel num_gangs(%numGangs: i64) num_workers(%numWorkers: i64) private(@privatization_memref_10_f32 -> %c : memref<10xf32>) { acc.loop gang { scf.for %x = %lb to %c10 step %st { acc.loop worker { @@ -168,7 +178,7 @@ // CHECK-NEXT: [[NUMGANG:%.*]] = arith.constant 10 : i64 // CHECK-NEXT: [[NUMWORKERS:%.*]] = arith.constant 10 : i64 // CHECK: acc.data dataOperands(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) { -// CHECK-NEXT: acc.parallel num_gangs([[NUMGANG]] : i64) num_workers([[NUMWORKERS]] : i64) private([[ARG2]] : memref<10xf32>) { +// CHECK-NEXT: acc.parallel num_gangs([[NUMGANG]] : i64) num_workers([[NUMWORKERS]] : i64) private(@privatization_memref_10_f32 -> [[ARG2]] : memref<10xf32>) { // CHECK-NEXT: acc.loop gang { // CHECK-NEXT: scf.for %{{.*}} = [[C0]] to [[C10]] step [[C1]] { // CHECK-NEXT: acc.loop worker { @@ -358,6 +368,26 @@ // ----- +acc.private.recipe @privatization_memref_10_f32 : memref<10xf32> init { +^bb0(%arg0: memref<10xf32>): + %0 = memref.alloc() : memref<10xf32> + acc.yield %0 : memref<10xf32> +} destroy { +^bb0(%arg0: memref<10xf32>): + memref.dealloc %arg0 : memref<10xf32> + acc.terminator +} + +acc.private.recipe @privatization_memref_10_10_f32 : memref<10x10xf32> init { +^bb0(%arg0: memref<10x10xf32>): + %0 = memref.alloc() : memref<10x10xf32> + acc.yield %0 : memref<10x10xf32> +} destroy { +^bb0(%arg0: memref<10x10xf32>): + memref.dealloc %arg0 : memref<10x10xf32> + 
acc.terminator +} + func.func @testparallelop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10xf32>) -> () { %i64value = arith.constant 1 : i64 %i32value = arith.constant 1 : i32 @@ -394,7 +424,7 @@ } acc.parallel vector_length(%idxValue: index) { } - acc.parallel private(%a, %c : memref<10xf32>, memref<10x10xf32>) firstprivate(%b: memref<10xf32>) { + acc.parallel private(@privatization_memref_10_f32 -> %a : memref<10xf32>, @privatization_memref_10_10_f32 -> %c : memref<10x10xf32>) firstprivate(%b: memref<10xf32>) { } acc.parallel { } attributes {defaultAttr = #acc} @@ -445,7 +475,7 @@ // CHECK-NEXT: } // CHECK: acc.parallel vector_length([[IDXVALUE]] : index) { // CHECK-NEXT: } -// CHECK: acc.parallel firstprivate([[ARGB]] : memref<10xf32>) private([[ARGA]], [[ARGC]] : memref<10xf32>, memref<10x10xf32>) { +// CHECK: acc.parallel firstprivate([[ARGB]] : memref<10xf32>) private(@privatization_memref_10_f32 -> [[ARGA]] : memref<10xf32>, @privatization_memref_10_10_f32 -> [[ARGC]] : memref<10x10xf32>) { // CHECK-NEXT: } // CHECK: acc.parallel { // CHECK-NEXT: } attributes {defaultAttr = #acc} @@ -460,7 +490,25 @@ // ----- -// ----- +acc.private.recipe @privatization_memref_10_f32 : memref<10xf32> init { +^bb0(%arg0: memref<10xf32>): + %0 = memref.alloc() : memref<10xf32> + acc.yield %0 : memref<10xf32> +} destroy { +^bb0(%arg0: memref<10xf32>): + memref.dealloc %arg0 : memref<10xf32> + acc.terminator +} + +acc.private.recipe @privatization_memref_10_10_f32 : memref<10x10xf32> init { +^bb0(%arg0: memref<10x10xf32>): + %0 = memref.alloc() : memref<10x10xf32> + acc.yield %0 : memref<10x10xf32> +} destroy { +^bb0(%arg0: memref<10x10xf32>): + memref.dealloc %arg0 : memref<10x10xf32> + acc.terminator +} func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10xf32>) -> () { %i64value = arith.constant 1 : i64 @@ -480,7 +528,7 @@ } acc.serial wait(%i64value, %i32value, %idxValue : i64, i32, index) { } - acc.serial private(%a, %c : 
memref<10xf32>, memref<10x10xf32>) firstprivate(%b: memref<10xf32>) { + acc.serial private(@privatization_memref_10_f32 -> %a : memref<10xf32>, @privatization_memref_10_10_f32 -> %c : memref<10x10xf32>) firstprivate(%b: memref<10xf32>) { } acc.serial { } attributes {defaultAttr = #acc} @@ -516,7 +564,7 @@ // CHECK-NEXT: } // CHECK: acc.serial wait([[I64VALUE]], [[I32VALUE]], [[IDXVALUE]] : i64, i32, index) { // CHECK-NEXT: } -// CHECK: acc.serial firstprivate([[ARGB]] : memref<10xf32>) private([[ARGA]], [[ARGC]] : memref<10xf32>, memref<10x10xf32>) { +// CHECK: acc.serial firstprivate([[ARGB]] : memref<10xf32>) private(@privatization_memref_10_f32 -> [[ARGA]] : memref<10xf32>, @privatization_memref_10_10_f32 -> [[ARGC]] : memref<10x10xf32>) { // CHECK-NEXT: } // CHECK: acc.serial { // CHECK-NEXT: } attributes {defaultAttr = #acc} diff --git a/mlir/test/Dialect/OpenMP/attr.mlir b/mlir/test/Dialect/OpenMP/attr.mlir --- a/mlir/test/Dialect/OpenMP/attr.mlir +++ b/mlir/test/Dialect/OpenMP/attr.mlir @@ -56,3 +56,83 @@ // CHECK: module attributes {omp.version = #omp.version} { module attributes {omp.version = #omp.version} {} + +// ---- + +// CHECK-LABEL: func @omp_decl_tar_host_to +// CHECK-SAME: {{.*}} attributes {omp.declare_target = #omp.declaretarget} { +func.func @omp_decl_tar_host_to() -> () attributes {omp.declare_target = #omp.declaretarget} { + return +} + +// CHECK-LABEL: func @omp_decl_tar_host_link +// CHECK-SAME: {{.*}} attributes {omp.declare_target = #omp.declaretarget} { +func.func @omp_decl_tar_host_link() -> () attributes {omp.declare_target = #omp.declaretarget} { + return +} + +// CHECK-LABEL: func @omp_decl_tar_nohost_to +// CHECK-SAME: {{.*}} attributes {omp.declare_target = #omp.declaretarget} { +func.func @omp_decl_tar_nohost_to() -> () attributes {omp.declare_target = #omp.declaretarget} { + return +} + +// CHECK-LABEL: func @omp_decl_tar_nohost_link +// CHECK-SAME: {{.*}} attributes {omp.declare_target = #omp.declaretarget} { +func.func 
@omp_decl_tar_nohost_link() -> () attributes {omp.declare_target = #omp.declaretarget} { + return +} + +// CHECK-LABEL: func @omp_decl_tar_any_to +// CHECK-SAME: {{.*}} attributes {omp.declare_target = #omp.declaretarget} { +func.func @omp_decl_tar_any_to() -> () attributes {omp.declare_target = #omp.declaretarget} { + return +} + +// CHECK-LABEL: func @omp_decl_tar_any_link +// CHECK-SAME: {{.*}} attributes {omp.declare_target = #omp.declaretarget} { +func.func @omp_decl_tar_any_link() -> () attributes {omp.declare_target = #omp.declaretarget} { + return +} + +// CHECK-LABEL: global external @omp_decl_tar_data_host_to +// CHECK-SAME: {{.*}} {{{.*}}omp.declare_target = #omp.declaretarget} +llvm.mlir.global external @omp_decl_tar_data_host_to() {omp.declare_target = #omp.declaretarget} : i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + llvm.return %0 : i32 +} + +// CHECK-LABEL: global external @omp_decl_tar_data_host_link +// CHECK-SAME: {{.*}} {{{.*}}omp.declare_target = #omp.declaretarget} +llvm.mlir.global external @omp_decl_tar_data_host_link() {omp.declare_target = #omp.declaretarget} : i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + llvm.return %0 : i32 +} + +// CHECK-LABEL: global external @omp_decl_tar_data_nohost_to +// CHECK-SAME: {{.*}} {{{.*}}omp.declare_target = #omp.declaretarget} +llvm.mlir.global external @omp_decl_tar_data_nohost_to() {omp.declare_target = #omp.declaretarget} : i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + llvm.return %0 : i32 +} + +// CHECK-LABEL: global external @omp_decl_tar_data_nohost_link +// CHECK-SAME: {{.*}} {{{.*}}omp.declare_target = #omp.declaretarget} +llvm.mlir.global external @omp_decl_tar_data_nohost_link() {omp.declare_target = #omp.declaretarget} : i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + llvm.return %0 : i32 +} + +// CHECK-LABEL: global external @omp_decl_tar_data_any_to +// CHECK-SAME: {{.*}} {{{.*}}omp.declare_target = #omp.declaretarget} +llvm.mlir.global external @omp_decl_tar_data_any_to() 
{omp.declare_target = #omp.declaretarget} : i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + llvm.return %0 : i32 +} + +// CHECK-LABEL: global external @omp_decl_tar_data_any_link +// CHECK-SAME: {{.*}} {{{.*}}omp.declare_target = #omp.declaretarget} +llvm.mlir.global external @omp_decl_tar_data_any_link() {omp.declare_target = #omp.declaretarget} : i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + llvm.return %0 : i32 +} diff --git a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir --- a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir +++ b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir @@ -23,9 +23,9 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.cast %0 : !pdl.operation to !transform.op<"scf.for"> +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for"> %2 = transform.loop.coalesce %1: (!transform.op<"scf.for">) -> (!transform.op<"scf.for">) } @@ -49,9 +49,9 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["affine.for"]} attributes {coalesce} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.cast %0 : !pdl.operation to !transform.op<"affine.for"> +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["affine.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.cast %0 : !transform.any_op to !transform.op<"affine.for"> %2 = transform.loop.coalesce %1 : (!transform.op<"affine.for">) -> (!transform.op<"affine.for">) } @@ -84,9 +84,9 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match 
ops{["scf.for"]} attributes {coalesce} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.cast %0 : !pdl.operation to !transform.op<"scf.for"> +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for"> %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">) transform.loop.unroll %2 {factor = 3} : !transform.op<"scf.for"> } diff --git a/mlir/test/Dialect/SCF/transform-ops-invalid.mlir b/mlir/test/Dialect/SCF/transform-ops-invalid.mlir --- a/mlir/test/Dialect/SCF/transform-ops-invalid.mlir +++ b/mlir/test/Dialect/SCF/transform-ops-invalid.mlir @@ -11,9 +11,9 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["affine.for"]} attributes {coalesce} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.cast %0 : !pdl.operation to !transform.op<"affine.for"> +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["affine.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.cast %0 : !transform.any_op to !transform.op<"affine.for"> // expected-error @below {{failed to coalesce}} %2 = transform.loop.coalesce %1: (!transform.op<"affine.for">) -> (!transform.op<"affine.for">) } @@ -28,9 +28,9 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.loop.get_parent_for %0 { affine = true } : (!pdl.operation) -> !transform.op<"affine.for"> +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.loop.get_parent_for %0 { affine = true } : (!transform.any_op) -> !transform.op<"affine.for"> // 
expected-error @below {{failed to unroll}} transform.loop.unroll %1 { factor = 8 } : !transform.op<"affine.for"> } diff --git a/mlir/test/Dialect/SCF/transform-ops.mlir b/mlir/test/Dialect/SCF/transform-ops.mlir --- a/mlir/test/Dialect/SCF/transform-ops.mlir +++ b/mlir/test/Dialect/SCF/transform-ops.mlir @@ -16,12 +16,12 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!pdl.operation) -> !pdl.operation +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op // CHECK: = transform.loop.get_parent_for - %1 = transform.loop.get_parent_for %0 : (!pdl.operation) -> !transform.op<"scf.for"> - %2 = transform.loop.get_parent_for %0 { num_loops = 2 } : (!pdl.operation) -> !transform.op<"scf.for"> - %3 = transform.loop.get_parent_for %0 { num_loops = 3 } : (!pdl.operation) -> !transform.op<"scf.for"> + %1 = transform.loop.get_parent_for %0 : (!transform.any_op) -> !transform.op<"scf.for"> + %2 = transform.loop.get_parent_for %0 { num_loops = 2 } : (!transform.any_op) -> !transform.op<"scf.for"> + %3 = transform.loop.get_parent_for %0 { num_loops = 3 } : (!transform.any_op) -> !transform.op<"scf.for"> transform.test_print_remark_at_operand %1, "third loop" : !transform.op<"scf.for"> transform.test_print_remark_at_operand %2, "second loop" : !transform.op<"scf.for"> transform.test_print_remark_at_operand %3, "first loop" : !transform.op<"scf.for"> @@ -36,10 +36,10 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!pdl.operation) -> !pdl.operation +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{could not find an 'scf.for' parent}} - %1 = transform.loop.get_parent_for %0 : (!pdl.operation) -> 
!transform.op<"scf.for"> + %1 = transform.loop.get_parent_for %0 : (!transform.any_op) -> !transform.op<"scf.for"> } // ----- @@ -104,10 +104,10 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.loop.get_parent_for %0 : (!pdl.operation) -> !transform.op<"scf.for"> - transform.loop.peel %1 : (!transform.op<"scf.for">) -> !pdl.operation +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.loop.get_parent_for %0 : (!transform.any_op) -> !transform.op<"scf.for"> + transform.loop.peel %1 : (!transform.op<"scf.for">) -> !transform.any_op } // ----- @@ -137,12 +137,12 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["arith.addf"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.loop.get_parent_for %0 : (!pdl.operation) -> !transform.op<"scf.for"> - %2 = transform.loop.pipeline %1 : (!transform.op<"scf.for">) -> !pdl.operation +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["arith.addf"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.loop.get_parent_for %0 : (!transform.any_op) -> !transform.op<"scf.for"> + %2 = transform.loop.pipeline %1 : (!transform.op<"scf.for">) -> !transform.any_op // Verify that the returned handle is usable. 
- transform.test_print_remark_at_operand %2, "transformed" : !pdl.operation + transform.test_print_remark_at_operand %2, "transformed" : !transform.any_op } // ----- @@ -161,9 +161,9 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.loop.get_parent_for %0 : (!pdl.operation) -> !transform.op<"scf.for"> +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.loop.get_parent_for %0 : (!transform.any_op) -> !transform.op<"scf.for"> transform.loop.unroll %1 { factor = 4 } : !transform.op<"scf.for"> } @@ -185,12 +185,12 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!pdl.operation) -> !pdl.operation +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op // CHECK: = transform.loop.get_parent_for - %1 = transform.loop.get_parent_for %0 { affine = true } : (!pdl.operation) -> !transform.op<"affine.for"> - %2 = transform.loop.get_parent_for %0 { num_loops = 2, affine = true } : (!pdl.operation) -> !transform.op<"affine.for"> - %3 = transform.loop.get_parent_for %0 { num_loops = 3, affine = true } : (!pdl.operation) -> !transform.op<"affine.for"> + %1 = transform.loop.get_parent_for %0 { affine = true } : (!transform.any_op) -> !transform.op<"affine.for"> + %2 = transform.loop.get_parent_for %0 { num_loops = 2, affine = true } : (!transform.any_op) -> !transform.op<"affine.for"> + %3 = transform.loop.get_parent_for %0 { num_loops = 3, affine = true } : (!transform.any_op) -> !transform.op<"affine.for"> transform.test_print_remark_at_operand %1, "third loop" : !transform.op<"affine.for"> transform.test_print_remark_at_operand %2, "second loop" : 
!transform.op<"affine.for"> transform.test_print_remark_at_operand %3, "first loop" : !transform.op<"affine.for"> @@ -205,10 +205,10 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!pdl.operation) -> !pdl.operation +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{could not find an 'affine.for' parent}} - %1 = transform.loop.get_parent_for %0 { affine = true } : (!pdl.operation) -> !transform.op<"affine.for"> + %1 = transform.loop.get_parent_for %0 { affine = true } : (!transform.any_op) -> !transform.op<"affine.for"> } // ----- @@ -227,9 +227,9 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.loop.get_parent_for %0 { affine = true } : (!pdl.operation) -> !transform.op<"affine.for"> +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.loop.get_parent_for %0 { affine = true } : (!transform.any_op) -> !transform.op<"affine.for"> transform.test_print_remark_at_operand %1, "affine for loop" : !transform.op<"affine.for"> transform.loop.unroll %1 { factor = 4, affine = true } : !transform.op<"affine.for"> } @@ -252,9 +252,9 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.loop.get_parent_for %0 { num_loops = 1, affine = true } : (!pdl.operation) -> !transform.op<"affine.for"> +^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.loop.get_parent_for %0 { num_loops = 1, affine = true 
} : (!transform.any_op) -> !transform.op<"affine.for"> transform.test_print_remark_at_operand %1, "affine for loop" : !transform.op<"affine.for"> transform.loop.unroll %1 { factor = 4 } : !transform.op<"affine.for"> } diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir --- a/mlir/test/Dialect/Tensor/bufferize.mlir +++ b/mlir/test/Dialect/Tensor/bufferize.mlir @@ -582,3 +582,20 @@ // CHECK: return %[[r]] : tensor return %0 : tensor } + +// ----- + +// CHECK-LABEL: func @tensor.splat( +// CHECK-SAME: %[[F:.*]]: f32) +// CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() {{.*}} : memref<10x2x4xf32> +// CHECK: %[[ALLOC_T:.*]] = bufferization.to_tensor %[[ALLOC]] +// CHECK: %[[MAPPED:.*]] = linalg.map +// CHECK: outs(%[[ALLOC_T]] : tensor<10x2x4xf32>) +// CHECK: linalg.yield %[[F]] +// CHECK: } +// CHECK: return %[[MAPPED]] : tensor<10x2x4xf32> +// CHECK: } +func.func @tensor.splat(%f: f32) -> tensor<10x2x4xf32> { + %t = tensor.splat %f : tensor<10x2x4xf32> + return %t : tensor<10x2x4xf32> +} diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir --- a/mlir/test/Dialect/Tosa/canonicalize.mlir +++ b/mlir/test/Dialect/Tosa/canonicalize.mlir @@ -549,3 +549,13 @@ return %1 : tensor } +// ----- + +// CHECK-LABEL: @fold_abs_abs +func.func @fold_abs_abs(%arg0: tensor) -> tensor { + // CHECK: %[[ABS:.*]] = "tosa.abs"(%arg{{.*}}) : (tensor) -> tensor + // CHECK: return %[[ABS]] : tensor + %0 = "tosa.abs"(%arg0) : (tensor) -> tensor + %1 = "tosa.abs"(%0) : (tensor) -> tensor + return %1 : tensor +} diff --git a/mlir/test/Dialect/Transform/check-use-after-free.mlir b/mlir/test/Dialect/Transform/check-use-after-free.mlir --- a/mlir/test/Dialect/Transform/check-use-after-free.mlir +++ b/mlir/test/Dialect/Transform/check-use-after-free.mlir @@ -2,7 +2,7 @@ func.func @use_after_free_branching_control_flow() { // expected-note @below {{allocated here}} - %0 = 
transform.test_produce_self_handle_or_forward_operand + %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op transform.test_transform_op_with_regions { "transform.test_branching_transform_op_terminator"() : () -> () }, @@ -11,14 +11,14 @@ "transform.test_branching_transform_op_terminator"()[^bb1, ^bb2] : () -> () ^bb1: // expected-note @below {{freed here}} - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op "transform.test_branching_transform_op_terminator"()[^bb3] : () -> () ^bb2: "transform.test_branching_transform_op_terminator"()[^bb3] : () -> () ^bb3: // expected-warning @below {{operand #0 may be used after free}} - transform.sequence %0 : !pdl.operation failures(propagate) { - ^bb0(%arg0: !pdl.operation): + transform.sequence %0 : !transform.any_op failures(propagate) { + ^bb0(%arg0: !transform.any_op): } "transform.test_branching_transform_op_terminator"() : () -> () } @@ -29,7 +29,7 @@ func.func @use_after_free_in_nested_op() { // expected-note @below {{allocated here}} - %0 = transform.test_produce_self_handle_or_forward_operand + %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op // expected-note @below {{freed here}} transform.test_transform_op_with_regions { "transform.test_branching_transform_op_terminator"() : () -> () @@ -38,7 +38,7 @@ ^bb0: "transform.test_branching_transform_op_terminator"()[^bb1, ^bb2] : () -> () ^bb1: - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op "transform.test_branching_transform_op_terminator"()[^bb3] : () -> () ^bb2: "transform.test_branching_transform_op_terminator"()[^bb3] : 
() -> () @@ -46,8 +46,8 @@ "transform.test_branching_transform_op_terminator"() : () -> () } // expected-warning @below {{operand #0 may be used after free}} - transform.sequence %0 : !pdl.operation failures(propagate) { - ^bb0(%arg0: !pdl.operation): + transform.sequence %0 : !transform.any_op failures(propagate) { + ^bb0(%arg0: !transform.any_op): } return } @@ -56,29 +56,29 @@ func.func @use_after_free_recursive_side_effects() { transform.sequence failures(propagate) { - ^bb0(%arg0: !pdl.operation): + ^bb0(%arg0: !transform.any_op): // expected-note @below {{allocated here}} - %0 = transform.sequence %arg0 : !pdl.operation -> !pdl.operation failures(propagate) attributes { ord = 1 } { - ^bb1(%arg1: !pdl.operation): - yield %arg1 : !pdl.operation + %0 = transform.sequence %arg0 : !transform.any_op -> !transform.any_op failures(propagate) attributes { ord = 1 } { + ^bb1(%arg1: !transform.any_op): + yield %arg1 : !transform.any_op } - transform.sequence %0 : !pdl.operation failures(propagate) attributes { ord = 2 } { - ^bb2(%arg2: !pdl.operation): + transform.sequence %0 : !transform.any_op failures(propagate) attributes { ord = 2 } { + ^bb2(%arg2: !transform.any_op): } - transform.sequence %0 : !pdl.operation failures(propagate) attributes { ord = 3 } { - ^bb3(%arg3: !pdl.operation): + transform.sequence %0 : !transform.any_op failures(propagate) attributes { ord = 3 } { + ^bb3(%arg3: !transform.any_op): } // `transform.sequence` has recursive side effects so it has the same "free" // as the child op it contains. 
// expected-note @below {{freed here}} - transform.sequence %0 : !pdl.operation failures(propagate) attributes { ord = 4 } { - ^bb4(%arg4: !pdl.operation): - test_consume_operand_of_op_kind_or_fail %0, "transform.sequence" + transform.sequence %0 : !transform.any_op failures(propagate) attributes { ord = 4 } { + ^bb4(%arg4: !transform.any_op): + test_consume_operand_of_op_kind_or_fail %0, "transform.sequence" : !transform.any_op } // expected-warning @below {{operand #0 may be used after free}} - transform.sequence %0 : !pdl.operation failures(propagate) attributes { ord = 5 } { - ^bb3(%arg3: !pdl.operation): + transform.sequence %0 : !transform.any_op failures(propagate) attributes { ord = 5 } { + ^bb3(%arg3: !transform.any_op): } } return @@ -88,24 +88,24 @@ func.func @use_after_free() { transform.sequence failures(propagate) { - ^bb0(%arg0: !pdl.operation): + ^bb0(%arg0: !transform.any_op): // expected-note @below {{allocated here}} - %0 = transform.sequence %arg0 : !pdl.operation -> !pdl.operation failures(propagate) attributes { ord = 1 } { - ^bb1(%arg1: !pdl.operation): - yield %arg1 : !pdl.operation + %0 = transform.sequence %arg0 : !transform.any_op -> !transform.any_op failures(propagate) attributes { ord = 1 } { + ^bb1(%arg1: !transform.any_op): + yield %arg1 : !transform.any_op } - transform.sequence %0 : !pdl.operation failures(propagate) attributes { ord = 2 } { - ^bb2(%arg2: !pdl.operation): + transform.sequence %0 : !transform.any_op failures(propagate) attributes { ord = 2 } { + ^bb2(%arg2: !transform.any_op): } - transform.sequence %0 : !pdl.operation failures(propagate) attributes { ord = 3 } { - ^bb3(%arg3: !pdl.operation): + transform.sequence %0 : !transform.any_op failures(propagate) attributes { ord = 3 } { + ^bb3(%arg3: !transform.any_op): } // expected-note @below {{freed here}} - test_consume_operand_of_op_kind_or_fail %0, "transform.sequence" + test_consume_operand_of_op_kind_or_fail %0, "transform.sequence" : !transform.any_op // 
expected-warning @below {{operand #0 may be used after free}} - transform.sequence %0 : !pdl.operation failures(propagate) attributes { ord = 5 } { - ^bb3(%arg3: !pdl.operation): + transform.sequence %0 : !transform.any_op failures(propagate) attributes { ord = 5 } { + ^bb3(%arg3: !transform.any_op): } } return @@ -118,7 +118,7 @@ // be reported as use-after-free. func.func @use_after_free_self_cycle() { // expected-note @below {{allocated here}} - %0 = transform.test_produce_self_handle_or_forward_operand + %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op transform.test_transform_op_with_regions { "transform.test_branching_transform_op_terminator"() : () -> () }, @@ -127,12 +127,12 @@ "transform.test_branching_transform_op_terminator"()[^bb1] : () -> () ^bb1: // expected-warning @below {{operand #0 may be used after free}} - transform.sequence %0 : !pdl.operation failures(propagate) { - ^bb0(%arg0: !pdl.operation): + transform.sequence %0 : !transform.any_op failures(propagate) { + ^bb0(%arg0: !transform.any_op): } // expected-warning @below {{operand #0 may be used after free}} // expected-note @below {{freed here}} - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op "transform.test_branching_transform_op_terminator"()[^bb1, ^bb2] : () -> () ^bb2: "transform.test_branching_transform_op_terminator"() : () -> () @@ -147,7 +147,7 @@ // use-after-free. 
func.func @use_after_free_cycle() { // expected-note @below {{allocated here}} - %0 = transform.test_produce_self_handle_or_forward_operand + %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op transform.test_transform_op_with_regions { "transform.test_branching_transform_op_terminator"() : () -> () }, @@ -157,7 +157,7 @@ ^bb1: // expected-warning @below {{operand #0 may be used after free}} // expected-note @below {{freed here}} - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op "transform.test_branching_transform_op_terminator"()[^bb2, ^bb3] : () -> () ^bb2: "transform.test_branching_transform_op_terminator"()[^bb1] : () -> () @@ -172,8 +172,8 @@ // This should not crash. transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): - alternatives %arg0 : !pdl.operation { - ^bb0(%arg1: !pdl.operation): +^bb0(%arg0: !transform.any_op): + alternatives %arg0 : !transform.any_op { + ^bb0(%arg1: !transform.any_op): } } diff --git a/mlir/test/Dialect/Transform/expensive-checks.mlir b/mlir/test/Dialect/Transform/expensive-checks.mlir --- a/mlir/test/Dialect/Transform/expensive-checks.mlir +++ b/mlir/test/Dialect/Transform/expensive-checks.mlir @@ -7,7 +7,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @return : benefit(1) { %0 = operands %1 = types @@ -15,15 +15,15 @@ rewrite %2 with "transform.dialect" } - sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): // expected-note @below {{handle to invalidated ops}} - %0 = pdl_match @return in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> 
!pdl.operation + %0 = pdl_match @return in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = get_closest_isolated_parent %0 : (!transform.any_op) -> !transform.any_op // expected-note @below {{invalidated by this transform op that consumes its operand #0}} - test_consume_operand %1 : !pdl.operation + test_consume_operand %1 : !transform.any_op // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - test_print_remark_at_operand %0, "remark" : !pdl.operation + test_print_remark_at_operand %0, "remark" : !transform.any_op } } @@ -36,7 +36,7 @@ func.func private @func2() transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @func : benefit(1) { %0 = operands %1 = types @@ -50,14 +50,14 @@ rewrite %2 with "transform.dialect" } - sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %0 = pdl_match @func in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = pdl_match @return in %arg1 : (!pdl.operation) -> !pdl.operation - %2 = replicate num(%0) %1 : !pdl.operation, !pdl.operation + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @func in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = pdl_match @return in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = replicate num(%0) %1 : !transform.any_op, !transform.any_op // expected-error @below {{a handle passed as operand #0 and consumed by this operation points to a payload entity more than once}} - test_consume_operand %2 : !pdl.operation - test_print_remark_at_operand %0, "remark" : !pdl.operation + test_consume_operand %2 : !transform.any_op + test_print_remark_at_operand %0, "remark" : !transform.any_op } } @@ -69,14 +69,14 @@ module { transform.sequence failures(propagate) { - ^bb0(%0: !pdl.operation): - %1 = transform.test_copy_payload %0 + ^bb0(%0: !transform.any_op): + %1 = transform.test_copy_payload %0 : 
(!transform.any_op) -> !transform.any_op // expected-note @below {{handle to invalidated ops}} - %2 = transform.test_copy_payload %0 + %2 = transform.test_copy_payload %0 : (!transform.any_op) ->!transform.any_op // expected-note @below {{invalidated by this transform op that consumes its operand #0}} - transform.test_consume_operand %1 : !pdl.operation + transform.test_consume_operand %1 : !transform.any_op // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} - transform.test_consume_operand %2 : !pdl.operation + transform.test_consume_operand %2 : !transform.any_op } } @@ -87,16 +87,16 @@ module { transform.sequence failures(propagate) { - ^bb0(%0: !pdl.operation): - %1 = transform.test_copy_payload %0 + ^bb0(%0: !transform.any_op): + %1 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op // expected-note @below {{handle to invalidated ops}} - %2 = transform.test_copy_payload %0 + %2 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op // Consuming two handles in the same operation is invalid if they point // to overlapping sets of payload IR ops. 
// // expected-error @below {{op uses a handle invalidated by a previously executed transform op}} // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities}} - transform.test_consume_operand %1, %2 : !pdl.operation + transform.test_consume_operand %1, %2 : !transform.any_op, !transform.any_op } } @@ -107,10 +107,10 @@ module { transform.sequence failures(propagate) { - ^bb0(%0: !pdl.operation): - %1 = transform.test_copy_payload %0 - %2 = transform.test_copy_payload %0 - transform.merge_handles %1, %2 { deduplicate } : !pdl.operation + ^bb0(%0: !transform.any_op): + %1 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op + %2 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op + transform.merge_handles %1, %2 { deduplicate } : !transform.any_op } } // ----- diff --git a/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir b/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir --- a/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir +++ b/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir @@ -2,7 +2,7 @@ // RUN: --split-input-file --verify-diagnostics transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation, %arg1: !transform.param, %arg2: !transform.param): +^bb0(%arg0: !transform.any_op, %arg1: !transform.param, %arg2: !transform.param): // expected-remark @below {{1 : i64, 2 : i64, 3 : i64}} transform.test_print_param %arg1 : !transform.param // expected-remark @below {{42 : i64, 45 : i64}} @@ -12,7 +12,7 @@ // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation, %arg1: !transform.any_op, %arg2: !transform.param): +^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.param): // expected-error @above {{wrong kind of value provided for top-level operation handle}} } @@ -20,5 +20,5 @@ // expected-error @below {{operation expects 3 extra value bindings, 
but 2 were provided to the interpreter}} transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation, %arg1: !transform.param, %arg2: !transform.param, %arg3: !transform.param): +^bb0(%arg0: !transform.any_op, %arg1: !transform.param, %arg2: !transform.param, %arg3: !transform.param): } diff --git a/mlir/test/Dialect/Transform/ops-invalid.mlir b/mlir/test/Dialect/Transform/ops-invalid.mlir --- a/mlir/test/Dialect/Transform/ops-invalid.mlir +++ b/mlir/test/Dialect/Transform/ops-invalid.mlir @@ -15,10 +15,10 @@ // expected-note @below {{nested in another possible top-level op}} transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-error @below {{expects operands to be provided for a nested op}} transform.sequence failures(propagate) { - ^bb1(%arg1: !pdl.operation): + ^bb1(%arg1: !transform.any_op): } } @@ -34,7 +34,7 @@ // expected-error @below {{expected children ops to implement TransformOpInterface}} transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-note @below {{op without interface}} arith.constant 42.0 : f32 } @@ -42,8 +42,8 @@ // ----- // expected-error @below {{expects the types of the terminator operands to match the types of the result}} -%0 = transform.sequence -> !pdl.operation failures(propagate) { -^bb0(%arg0: !pdl.operation): +%0 = transform.sequence -> !transform.any_op failures(propagate) { +^bb0(%arg0: !transform.any_op): // expected-note @below {{terminator}} transform.yield } @@ -54,7 +54,7 @@ ^bb0(%arg0: !transform.any_op): // expected-error @below {{expects the type of the block argument to match the type of the operand}} transform.sequence %arg0: !transform.any_op failures(propagate) { - ^bb1(%arg1: !pdl.operation): + ^bb1(%arg1: !transform.op<"builtin.module">): transform.yield } } @@ -82,10 +82,10 @@ // expected-note @below {{nested in another possible top-level op}} transform.with_pdl_patterns { 
-^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-error @below {{expects operands to be provided for a nested op}} transform.sequence failures(propagate) { - ^bb1(%arg1: !pdl.operation): + ^bb1(%arg1: !transform.any_op): } } @@ -93,14 +93,14 @@ // expected-error @below {{expects only one non-pattern op in its body}} transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-note @below {{first non-pattern op}} transform.sequence failures(propagate) { - ^bb1(%arg1: !pdl.operation): + ^bb1(%arg1: !transform.any_op): } // expected-note @below {{second non-pattern op}} transform.sequence failures(propagate) { - ^bb1(%arg1: !pdl.operation): + ^bb1(%arg1: !transform.any_op): } } @@ -108,7 +108,7 @@ // expected-error @below {{expects only pattern and top-level transform ops in its body}} transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-note @below {{offending op}} "test.something"() : () -> () } @@ -117,10 +117,10 @@ // expected-note @below {{parent operation}} transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-error @below {{op cannot be nested}} - transform.with_pdl_patterns %arg0 : !pdl.operation { - ^bb1(%arg1: !pdl.operation): + transform.with_pdl_patterns %arg0 : !transform.any_op { + ^bb1(%arg1: !transform.any_op): } } @@ -128,7 +128,7 @@ // expected-error @below {{op expects at least one non-pattern op}} transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @some : benefit(1) { %0 = pdl.operation "test.foo" pdl.rewrite %0 with "transform.dialect" @@ -138,10 +138,10 @@ // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-error @below {{op expects at least one non-pattern op}} - with_pdl_patterns %arg0 : !pdl.operation { - ^bb1(%arg1: !pdl.operation): + 
with_pdl_patterns %arg0 : !transform.any_op { + ^bb1(%arg1: !transform.any_op): } } @@ -155,7 +155,7 @@ // expected-error @below {{expects a single-block region}} "transform.test_transform_unrestricted_op_no_interface"() ({ -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): "test.potential_terminator"() : () -> () ^bb1: "test.potential_terminator"() : () -> () @@ -164,59 +164,59 @@ // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-error @below {{result #0 has more than one potential consumer}} - %0 = test_produce_self_handle_or_forward_operand + %0 = test_produce_self_handle_or_forward_operand : () -> !transform.any_op // expected-note @below {{used here as operand #0}} - test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op // expected-note @below {{used here as operand #0}} - test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-error @below {{result #0 has more than one potential consumer}} - %0 = test_produce_self_handle_or_forward_operand + %0 = test_produce_self_handle_or_forward_operand : () -> !transform.any_op // expected-note @below {{used here as operand #0}} - test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op // expected-note @below {{used here as operand #0}} - transform.sequence %0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: 
!pdl.operation): - test_consume_operand_of_op_kind_or_fail %arg1, "transform.test_produce_self_handle_or_forward_operand" + transform.sequence %0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + test_consume_operand_of_op_kind_or_fail %arg1, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } } // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-error @below {{result #0 has more than one potential consumer}} - %0 = test_produce_self_handle_or_forward_operand + %0 = test_produce_self_handle_or_forward_operand : () -> !transform.any_op // expected-note @below {{used here as operand #0}} - test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" - transform.sequence %0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): + test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op + transform.sequence %0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): // expected-note @below {{used here as operand #0}} - test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } } // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-error @below {{result #0 has more than one potential consumer}} - %0 = test_produce_self_handle_or_forward_operand + %0 = test_produce_self_handle_or_forward_operand : () -> !transform.any_op // expected-note @below {{used here as operand #0}} - test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : 
!transform.any_op // expected-note @below {{used here as operand #0}} - transform.sequence %0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - transform.sequence %arg1 : !pdl.operation failures(propagate) { - ^bb2(%arg2: !pdl.operation): - test_consume_operand_of_op_kind_or_fail %arg2, "transform.test_produce_self_handle_or_forward_operand" + transform.sequence %0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + transform.sequence %arg1 : !transform.any_op failures(propagate) { + ^bb2(%arg2: !transform.any_op): + test_consume_operand_of_op_kind_or_fail %arg2, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } } } @@ -224,7 +224,7 @@ // ----- transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): +^bb1(%arg1: !transform.any_op): // expected-error @below {{expects at least one region}} transform.alternatives } @@ -232,13 +232,13 @@ // ----- transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): +^bb1(%arg1: !transform.any_op): // expected-error @below {{expects terminator operands to have the same type as results of the operation}} - %2 = transform.alternatives %arg1 : !pdl.operation -> !pdl.operation { - ^bb2(%arg2: !pdl.operation): - transform.yield %arg2 : !pdl.operation + %2 = transform.alternatives %arg1 : !transform.any_op -> !transform.any_op { + ^bb2(%arg2: !transform.any_op): + transform.yield %arg2 : !transform.any_op }, { - ^bb2(%arg2: !pdl.operation): + ^bb2(%arg2: !transform.any_op): // expected-note @below {{terminator}} transform.yield } @@ -255,16 +255,16 @@ // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): // expected-error @below {{result #0 has more than one potential consumer}} - %0 = test_produce_self_handle_or_forward_operand + %0 = test_produce_self_handle_or_forward_operand : () -> !transform.any_op // expected-note @below {{used here as operand #0}} - transform.foreach %0 
: !pdl.operation { - ^bb1(%arg1: !pdl.operation): - transform.test_consume_operand %arg1 : !pdl.operation + transform.foreach %0 : !transform.any_op { + ^bb1(%arg1: !transform.any_op): + transform.test_consume_operand %arg1 : !transform.any_op } // expected-note @below {{used here as operand #0}} - transform.test_consume_operand %0 : !pdl.operation + transform.test_consume_operand %0 : !transform.any_op } // ----- diff --git a/mlir/test/Dialect/Transform/ops.mlir b/mlir/test/Dialect/Transform/ops.mlir --- a/mlir/test/Dialect/Transform/ops.mlir +++ b/mlir/test/Dialect/Transform/ops.mlir @@ -1,42 +1,42 @@ // RUN: mlir-opt %s | mlir-opt | FileCheck %s // CHECK: transform.sequence -// CHECK: ^{{.+}}(%{{.+}}: !pdl.operation): +// CHECK: ^{{.+}}(%{{.+}}: !transform.any_op): transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): - // CHECK: sequence %{{.+}} : !pdl.operation - // CHECK: ^{{.+}}(%{{.+}}: !pdl.operation): - sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): +^bb0(%arg0: !transform.any_op): + // CHECK: sequence %{{.+}} : !transform.any_op + // CHECK: ^{{.+}}(%{{.+}}: !transform.any_op): + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): } } // CHECK: transform.with_pdl_patterns -// CHECK: ^{{.+}}(%[[ARG:.+]]: !pdl.operation): +// CHECK: ^{{.+}}(%[[ARG:.+]]: !transform.any_op): transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): - // CHECK: sequence %[[ARG]] : !pdl.operation - sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): +^bb0(%arg0: !transform.any_op): + // CHECK: sequence %[[ARG]] : !transform.any_op + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): } } // Using the same value multiple times without consuming it is fine. 
// CHECK: transform.sequence -// CHECK: %[[V:.+]] = sequence %{{.*}} : !pdl.operation -> !pdl.operation +// CHECK: %[[V:.+]] = sequence %{{.*}} : !transform.any_op -> !transform.any_op // CHECK: sequence %[[V]] // CHECK: sequence %[[V]] transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): - %0 = transform.sequence %arg0 : !pdl.operation -> !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - yield %arg1 : !pdl.operation +^bb0(%arg0: !transform.any_op): + %0 = transform.sequence %arg0 : !transform.any_op -> !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + yield %arg1 : !transform.any_op } - transform.sequence %0 : !pdl.operation failures(propagate) { - ^bb2(%arg2: !pdl.operation): + transform.sequence %0 : !transform.any_op failures(propagate) { + ^bb2(%arg2: !transform.any_op): } - transform.sequence %0 : !pdl.operation failures(propagate) { - ^bb3(%arg3: !pdl.operation): + transform.sequence %0 : !transform.any_op failures(propagate) { + ^bb3(%arg3: !transform.any_op): } } @@ -70,17 +70,17 @@ // CHECK: transform.sequence // CHECK: foreach transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): - transform.foreach %arg0 : !pdl.operation { - ^bb1(%arg1: !pdl.operation): +^bb0(%arg0: !transform.any_op): + transform.foreach %arg0 : !transform.any_op { + ^bb1(%arg1: !transform.any_op): } } // CHECK: transform.sequence transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): - // CHECK: cast %{{.*}} : !pdl.operation to !transform.any_op - %0 = cast %arg0: !pdl.operation to !transform.any_op +^bb0(%arg0: !transform.any_op): + // CHECK: cast %{{.*}} : !transform.any_op to !transform.any_op + %0 = cast %arg0: !transform.any_op to !transform.any_op // CHECK: cast %{{.*}} : !transform.any_op to !transform.op<"builtin.module"> %1 = cast %0: !transform.any_op to !transform.op<"builtin.module"> } @@ -91,9 +91,9 @@ // CHECK: print // CHECK: print transform.sequence failures(propagate) { 
-^bb0(%arg0: !pdl.operation): - transform.print %arg0 : !pdl.operation +^bb0(%arg0: !transform.any_op): + transform.print %arg0 : !transform.any_op transform.print - transform.print %arg0 {name = "test"} : !pdl.operation + transform.print %arg0 {name = "test"} : !transform.any_op transform.print {name = "test"} } diff --git a/mlir/test/Dialect/Transform/test-dialect-injection.mlir b/mlir/test/Dialect/Transform/test-dialect-injection.mlir --- a/mlir/test/Dialect/Transform/test-dialect-injection.mlir +++ b/mlir/test/Dialect/Transform/test-dialect-injection.mlir @@ -7,11 +7,11 @@ transform.test_transform_op // CHECK: = transform.test_produce_self_handle_or_forward_operand {foo = "bar"} -%0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } +%0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } : () -> !transform.any_op // CHECK: transform.test_consume_operand_of_op_kind_or_fail %{{.*}}, -transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" +transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op // Ensure that the extension type is roundtripped correctly. 
-// CHECK: transform.cast %{{.*}} : !pdl.operation to !transform.test_dialect_op -%1 = transform.cast %0: !pdl.operation to !transform.test_dialect_op +// CHECK: transform.cast %{{.*}} : !transform.any_op to !transform.test_dialect_op +%1 = transform.cast %0: !transform.any_op to !transform.test_dialect_op diff --git a/mlir/test/Dialect/Transform/test-interpreter.mlir b/mlir/test/Dialect/Transform/test-interpreter.mlir --- a/mlir/test/Dialect/Transform/test-interpreter.mlir +++ b/mlir/test/Dialect/Transform/test-interpreter.mlir @@ -10,18 +10,18 @@ transform.sequence failures(propagate) { ^bb0(%arg0: !transform.any_op): - %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } + %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } : () -> !transform.any_op // expected-remark @below {{succeeded}} - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } // ----- transform.sequence failures(propagate) { ^bb0(%arg0: !transform.any_op): - %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } + %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } : () -> !transform.any_op // expected-error @below {{expected the operand to be associated a payload op of kind transform.sequence got transform.test_produce_self_handle_or_forward_operand}} - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.sequence" + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.sequence" : !transform.any_op } // ----- @@ -31,18 +31,18 @@ // to detect double-consumption. 
transform.sequence failures(propagate) { ^bb0(%arg0: !transform.any_op): - %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } - %1 = transform.test_copy_payload %0 + %0 = transform.test_produce_self_handle_or_forward_operand { foo = "bar" } : () -> !transform.any_op + %1 = transform.test_copy_payload %0 : (!transform.any_op) -> !transform.any_op // expected-remark @below {{succeeded}} - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): - sequence %arg0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): +^bb0(%arg0: !transform.any_op): + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): // expected-remark @below {{applying transformation "a"}} test_transform_op "a" // expected-remark @below {{applying transformation "b"}} @@ -59,36 +59,36 @@ // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): - %0 = test_produce_self_handle_or_forward_operand - sequence %0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): +^bb0(%arg0: !transform.any_op): + %0 = test_produce_self_handle_or_forward_operand : () -> !transform.any_op + sequence %0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): // expected-remark @below {{succeeded}} - test_consume_operand_of_op_kind_or_fail %arg1, "transform.test_produce_self_handle_or_forward_operand" + test_consume_operand_of_op_kind_or_fail %arg1, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } } // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): - %0 = sequence %arg0 : !pdl.operation -> !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %1 = 
test_produce_self_handle_or_forward_operand - yield %1 : !pdl.operation +^bb0(%arg0: !transform.any_op): + %0 = sequence %arg0 : !transform.any_op -> !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %1 = test_produce_self_handle_or_forward_operand : () -> !transform.any_op + yield %1 : !transform.any_op } // expected-remark @below {{succeeded}} - test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } // ----- transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): - sequence %arg0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = pdl_match @some in %arg1 : (!pdl.operation) -> !pdl.operation - test_print_remark_at_operand %0, "matched" : !pdl.operation +^bb0(%arg0: !transform.any_op): + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + test_print_remark_at_operand %0, "matched" : !transform.any_op } pdl.pattern @some : benefit(1) { @@ -124,18 +124,18 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @const : benefit(1) { %r = pdl.types %0 = pdl.operation "arith.constant" -> (%r : !pdl.range) pdl.rewrite %0 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %f = pdl_match @const in %arg1 : (!pdl.operation) -> !pdl.operation - %m = get_closest_isolated_parent %f : (!pdl.operation) -> !pdl.operation - test_print_remark_at_operand %m, "parent function" : !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %f = pdl_match @const in %arg1 : (!transform.any_op) -> !transform.any_op + %m = get_closest_isolated_parent %f : 
(!transform.any_op) -> !transform.any_op + test_print_remark_at_operand %m, "parent function" : !transform.any_op } } @@ -147,7 +147,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @match_func : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -155,22 +155,22 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): // This is necessary to run the transformation on something other than the // top-level module, "alternatives" cannot be run on that. - %0 = pdl_match @match_func in %arg1 : (!pdl.operation) -> !pdl.operation - transform.alternatives %0 : !pdl.operation { - ^bb2(%arg2: !pdl.operation): - %1 = transform.test_produce_self_handle_or_forward_operand + %0 = pdl_match @match_func in %arg1 : (!transform.any_op) -> !transform.any_op + transform.alternatives %0 : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %1 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op // This operation fails, which triggers the next alternative without // reporting the error. 
- transform.test_consume_operand_of_op_kind_or_fail %1, "transform.sequence" + transform.test_consume_operand_of_op_kind_or_fail %1, "transform.sequence" : !transform.any_op }, { - ^bb2(%arg2: !pdl.operation): - %1 = transform.test_produce_self_handle_or_forward_operand + ^bb2(%arg2: !transform.any_op): + %1 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op // expected-remark @below {{succeeded}} - transform.test_consume_operand_of_op_kind_or_fail %1, "transform.test_produce_self_handle_or_forward_operand" + transform.test_consume_operand_of_op_kind_or_fail %1, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } } } @@ -185,7 +185,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @match_call : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -193,16 +193,16 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %0 = pdl_match @match_call in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = get_closest_isolated_parent %0 : (!transform.any_op) -> !transform.any_op // expected-error @below {{all alternatives failed}} - transform.alternatives %1 : !pdl.operation { - ^bb2(%arg2: !pdl.operation): - %2 = transform.pdl_match @match_call in %arg2 : (!pdl.operation) -> !pdl.operation + transform.alternatives %1 : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op // expected-remark @below {{applying}} - transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} + 
transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} : !transform.any_op } } } @@ -218,7 +218,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @match_call : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -226,25 +226,25 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %0 = pdl_match @match_call in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation - transform.alternatives %1 : !pdl.operation { - ^bb2(%arg2: !pdl.operation): - %2 = transform.pdl_match @match_call in %arg2 : (!pdl.operation) -> !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = get_closest_isolated_parent %0 : (!transform.any_op) -> !transform.any_op + transform.alternatives %1 : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op // expected-remark @below {{applying}} - transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} + transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} : !transform.any_op }, { - ^bb2(%arg2: !pdl.operation): - %2 = transform.pdl_match @match_call in %arg2 : (!pdl.operation) -> !pdl.operation - transform.test_print_remark_at_operand %2, "still here" : !pdl.operation + ^bb2(%arg2: !transform.any_op): + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %2, "still here" : !transform.any_op // This alternative succeeds. }, { - ^bb2(%arg2: !pdl.operation): + ^bb2(%arg2: !transform.any_op): // This alternative is never run, so we must not have a remark here. 
- %2 = transform.pdl_match @match_call in %arg2 : (!pdl.operation) -> !pdl.operation - transform.test_emit_remark_and_erase_operand %2, "should not happen" {fail_after_erase} + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op + transform.test_emit_remark_and_erase_operand %2, "should not happen" {fail_after_erase} : !transform.any_op } } } @@ -259,7 +259,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @match_call : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -267,20 +267,20 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %0 = pdl_match @match_call in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation - transform.alternatives %1 : !pdl.operation { - ^bb2(%arg2: !pdl.operation): - %2 = transform.pdl_match @match_call in %arg2 : (!pdl.operation) -> !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = get_closest_isolated_parent %0 : (!transform.any_op) -> !transform.any_op + transform.alternatives %1 : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op // expected-remark @below {{applying}} - transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} + transform.test_emit_remark_and_erase_operand %2, "applying" {fail_after_erase} : !transform.any_op }, { - ^bb2(%arg2: !pdl.operation): - %2 = transform.pdl_match @match_call in %arg2 : (!pdl.operation) -> !pdl.operation + ^bb2(%arg2: !transform.any_op): + %2 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op // expected-remark @below {{applying second 
time}} - transform.test_emit_remark_and_erase_operand %2, "applying second time" + transform.test_emit_remark_and_erase_operand %2, "applying second time" : !transform.any_op } } } @@ -295,7 +295,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @match_call : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -303,27 +303,27 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %0 = pdl_match @match_call in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation - %2 = transform.alternatives %1 : !pdl.operation -> !pdl.operation { - ^bb2(%arg2: !pdl.operation): - %3 = transform.pdl_match @match_call in %arg2 : (!pdl.operation) -> !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @match_call in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = get_closest_isolated_parent %0 : (!transform.any_op) -> !transform.any_op + %2 = transform.alternatives %1 : !transform.any_op -> !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %3 = transform.pdl_match @match_call in %arg2 : (!transform.any_op) -> !transform.any_op // expected-remark @below {{applying}} - transform.test_emit_remark_and_erase_operand %3, "applying" {fail_after_erase} - %4 = transform.test_produce_self_handle_or_forward_operand %3 - transform.yield %4 : !pdl.operation + transform.test_emit_remark_and_erase_operand %3, "applying" {fail_after_erase} : !transform.any_op + %4 = transform.test_produce_self_handle_or_forward_operand %3 : (!transform.any_op) -> !transform.any_op + transform.yield %4 : !transform.any_op }, { - ^bb2(%arg2: !pdl.operation): - %4 = transform.test_produce_self_handle_or_forward_operand - transform.yield %4 : !pdl.operation + ^bb2(%arg2: !transform.any_op): + %4 = 
transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op + transform.yield %4 : !transform.any_op } // The first alternative failed, so the returned value is taken from the // second alternative, associated test_produce_self_handle_or_forward_operand rather // than pdl_match. // expected-remark @below {{succeeded}} - transform.test_consume_operand_of_op_kind_or_fail %2, "transform.test_produce_self_handle_or_forward_operand" + transform.test_consume_operand_of_op_kind_or_fail %2, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } } @@ -343,16 +343,16 @@ } transform.sequence failures(propagate) { - ^bb1(%arg1: !pdl.operation): + ^bb1(%arg1: !transform.any_op): // expected-error @below {{scope must not contain the transforms being applied}} - transform.alternatives %arg1 : !pdl.operation { - ^bb2(%arg2: !pdl.operation): - %0 = transform.test_produce_self_handle_or_forward_operand - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.sequence" + transform.alternatives %arg1 : !transform.any_op { + ^bb2(%arg2: !transform.any_op): + %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.sequence" : !transform.any_op }, { - ^bb2(%arg2: !pdl.operation): - %0 = transform.test_produce_self_handle_or_forward_operand - transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" + ^bb2(%arg2: !transform.any_op): + %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op + transform.test_consume_operand_of_op_kind_or_fail %0, "transform.test_produce_self_handle_or_forward_operand" : !transform.any_op } } } @@ -368,7 +368,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @match_const : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -377,13 +377,13 @@ } - sequence %arg0 : 
!pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %0 = transform.pdl_match @match_const in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.loop.get_parent_for %0 : (!pdl.operation) -> !pdl.operation + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = transform.pdl_match @match_const in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.loop.get_parent_for %0 : (!transform.any_op) -> !transform.any_op // expected-error @below {{only isolated-from-above ops can be alternative scopes}} - alternatives %1 : !pdl.operation { - ^bb2(%arg2: !pdl.operation): + alternatives %1 : !transform.any_op { + ^bb2(%arg2: !transform.any_op): } } } @@ -396,7 +396,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @some : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -404,12 +404,12 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = pdl_match @some in %arg1 : (!pdl.operation) -> !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{application of transform.test_wrong_number_of_results expected to produce 3 results (actually produced 1).}} // expected-note @below {{if you need variadic results, consider a generic `apply` instead of the specialized `applyToOne`.}} - transform.test_wrong_number_of_results %0 + transform.test_wrong_number_of_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) } } @@ -423,7 +423,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @some : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -431,12 +431,12 @@ pdl.rewrite %2 with "transform.dialect" 
} - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = pdl_match @some in %arg1 : (!pdl.operation) -> !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{application of transform.test_wrong_number_of_multi_results expected to produce 1 results (actually produced 0)}} // expected-note @below {{if you need variadic results, consider a generic `apply` instead of the specialized `applyToOne`.}} - transform.test_wrong_number_of_multi_results %0 + transform.test_wrong_number_of_multi_results %0 : (!transform.any_op) -> (!transform.any_op) } } @@ -450,7 +450,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @some : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -458,11 +458,11 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = pdl_match @some in %arg1 : (!pdl.operation) -> !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op // Transform matches 3 ops and produces 2 results. 
- %1:2 = transform.test_correct_number_of_multi_results %0 + %1:2 = transform.test_correct_number_of_multi_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) } } @@ -474,7 +474,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @some : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -482,11 +482,11 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = pdl_match @some in %arg1 : (!pdl.operation) -> !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op // Transform fails to match any but still produces 2 results. - %1:2 = transform.test_correct_number_of_multi_results %0 + %1:2 = transform.test_correct_number_of_multi_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) } } @@ -500,7 +500,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @some : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -508,10 +508,10 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = pdl_match @some in %arg1 : (!pdl.operation) -> !pdl.operation - transform.test_mixed_null_and_non_null_results %0 + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + transform.test_mixed_null_and_non_null_results %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) } } @@ -530,7 +530,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @addi : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -544,12 +544,12 @@ pdl.rewrite 
%2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = pdl_match @addi in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = pdl_match @subi in %arg1 : (!pdl.operation) -> !pdl.operation - %2 = merge_handles %0, %1 : !pdl.operation - test_print_remark_at_operand %2, "matched" : !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @addi in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = pdl_match @subi in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = merge_handles %0, %1 : !transform.any_op + test_print_remark_at_operand %2, "matched" : !transform.any_op } } @@ -563,7 +563,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @some : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -571,11 +571,11 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = pdl_match @some in %arg1 : (!pdl.operation) -> !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{failed to apply}} - transform.test_mixed_sucess_and_silenceable %0 + transform.test_mixed_success_and_silenceable %0 : !transform.any_op } } @@ -587,7 +587,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @some : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -595,12 +595,12 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(suppress) { - ^bb0(%arg1: !pdl.operation): - %0 = pdl_match @some in %arg1 : (!pdl.operation) -> !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(suppress) { + ^bb0(%arg1: 
!transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op // Not expecting error here because we are suppressing it. // expected-remark @below {{foo}} - test_emit_remark_and_erase_operand %0, "foo" {fail_after_erase} + test_emit_remark_and_erase_operand %0, "foo" {fail_after_erase} : !transform.any_op } } @@ -612,7 +612,7 @@ } transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @some : benefit(1) { %0 = pdl.operands %1 = pdl.types @@ -620,12 +620,12 @@ pdl.rewrite %2 with "transform.dialect" } - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = pdl_match @some in %arg1 : (!pdl.operation) -> !pdl.operation + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{silenceable error}} // expected-remark @below {{foo}} - test_emit_remark_and_erase_operand %0, "foo" {fail_after_erase} + test_emit_remark_and_erase_operand %0, "foo" {fail_after_erase} : !transform.any_op } } @@ -747,10 +747,10 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %muli = get_producer_of_operand %addi[0] : (!pdl.operation) -> !pdl.operation - transform.test_print_remark_at_operand %muli, "found muli" : !pdl.operation +^bb1(%arg1: !transform.any_op): + %addi = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %muli = get_producer_of_operand %addi[0] : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %muli, "found muli" : !transform.any_op } // ----- @@ -762,10 +762,10 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %muli = transform.structured.match ops{["arith.muli"]} 
in %arg1 : (!pdl.operation) -> !pdl.operation +^bb1(%arg1: !transform.any_op): + %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{could not find a producer for operand number: 0 of}} - %bbarg = get_producer_of_operand %muli[0] : (!pdl.operation) -> !pdl.operation + %bbarg = get_producer_of_operand %muli[0] : (!transform.any_op) -> !transform.any_op } @@ -779,10 +779,10 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %addi = get_consumers_of_result %muli[0] : (!pdl.operation) -> !pdl.operation - transform.test_print_remark_at_operand %addi, "found addi" : !pdl.operation +^bb1(%arg1: !transform.any_op): + %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %addi = get_consumers_of_result %muli[0] : (!transform.any_op) -> !transform.any_op + transform.test_print_remark_at_operand %addi, "found addi" : !transform.any_op } // ----- @@ -794,10 +794,10 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!pdl.operation) -> !pdl.operation +^bb1(%arg1: !transform.any_op): + %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{handle must be mapped to exactly one payload op}} - %bbarg = get_consumers_of_result %muli[0] : (!pdl.operation) -> !pdl.operation + %bbarg = get_consumers_of_result %muli[0] : (!transform.any_op) -> !transform.any_op } @@ -809,10 +809,10 @@ } transform.sequence failures(propagate) { -^bb1(%arg1: !pdl.operation): - %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : (!pdl.operation) -> !pdl.operation +^bb1(%arg1: !transform.any_op): + %muli = transform.structured.match ops{["arith.muli"]} in %arg1 : 
(!transform.any_op) -> !transform.any_op // expected-error @below {{result number overflow}} - %bbarg = get_consumers_of_result %muli[1] : (!pdl.operation) -> !pdl.operation + %bbarg = get_consumers_of_result %muli[1] : (!transform.any_op) -> !transform.any_op } @@ -923,18 +923,18 @@ "other_dialect.other_op"() : () -> () transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @other : benefit(1) { %0 = pdl.operation "other_dialect.other_op" pdl.rewrite %0 with "transform.dialect" } - sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %0 = pdl_match @other in %arg1 : (!pdl.operation) -> !pdl.operation + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @other in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{expected the payload operation to belong to the 'test' dialect}} - %2 = transform.cast %0 : !pdl.operation to !transform.test_dialect_op - transform.cast %2 : !transform.test_dialect_op to !pdl.operation + %2 = transform.cast %0 : !transform.any_op to !transform.test_dialect_op + transform.cast %2 : !transform.test_dialect_op to !transform.any_op } } @@ -944,17 +944,17 @@ "other_dialect.other_op"() : () -> () transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @some : benefit(1) { %0 = pdl.operation "test.some_op" pdl.rewrite %0 with "transform.dialect" } - sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %0 = pdl_match @some in %arg1 : (!pdl.operation) -> !pdl.operation - %2 = transform.cast %0 : !pdl.operation to !transform.op<"test.some_op"> - transform.cast %2 : !transform.op<"test.some_op"> to !pdl.operation + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = transform.cast %0 : 
!transform.any_op to !transform.op<"test.some_op"> + transform.cast %2 : !transform.op<"test.some_op"> to !transform.any_op } } @@ -965,36 +965,36 @@ "other_dialect.other_op"() : () -> () transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): pdl.pattern @other : benefit(1) { %0 = pdl.operation "other_dialect.other_op" pdl.rewrite %0 with "transform.dialect" } - sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %0 = pdl_match @other in %arg1 : (!pdl.operation) -> !pdl.operation + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @other in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{incompatible payload operation name}} - %2 = transform.cast %0 : !pdl.operation to !transform.op<"test.some_op"> - transform.cast %2 : !transform.op<"test.some_op"> to !pdl.operation + %2 = transform.cast %0 : !transform.any_op to !transform.op<"test.some_op"> + transform.cast %2 : !transform.op<"test.some_op"> to !transform.any_op } } // ----- transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = pdl_match @some in %arg1 : (!pdl.operation) -> !pdl.operation +^bb0(%arg0: !transform.any_op): + transform.sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op // here, the handles nested under are {%arg0, %arg1, %0} // expected-remark @below {{3 handles nested under}} - transform.test_report_number_of_tracked_handles_nested_under %arg1 + transform.test_report_number_of_tracked_handles_nested_under %arg1 : !transform.any_op // expected-remark @below {{erased}} - transform.test_emit_remark_and_erase_operand %0, "erased" + transform.test_emit_remark_and_erase_operand %0, "erased" : !transform.any_op // here, the 
handles nested under are only {%arg0, %arg1} // expected-remark @below {{2 handles nested under}} - transform.test_report_number_of_tracked_handles_nested_under %arg1 + transform.test_report_number_of_tracked_handles_nested_under %arg1 : !transform.any_op } pdl.pattern @some : benefit(1) { @@ -1065,9 +1065,9 @@ // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): - %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation - %1 = transform.test_produce_param_with_number_of_test_ops %0 : !pdl.operation +^bb0(%arg0: !transform.any_op): + %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.test_produce_param_with_number_of_test_ops %0 : !transform.any_op // expected-remark @below {{1 : i32, 3 : i32}} transform.test_print_param %1 : !transform.test_dialect_param %2 = transform.test_add_to_param %1, 100 diff --git a/mlir/test/Dialect/Transform/transform-state-extension.mlir b/mlir/test/Dialect/Transform/transform-state-extension.mlir --- a/mlir/test/Dialect/Transform/transform-state-extension.mlir +++ b/mlir/test/Dialect/Transform/transform-state-extension.mlir @@ -3,15 +3,15 @@ // expected-note @below {{associated payload op}} module { transform.sequence failures(propagate) { - ^bb0(%arg0: !pdl.operation): + ^bb0(%arg0: !transform.any_op): // expected-remark @below {{extension absent}} - test_check_if_test_extension_present %arg0 + test_check_if_test_extension_present %arg0 : !transform.any_op test_add_test_extension "A" // expected-remark @below {{extension present, A}} - test_check_if_test_extension_present %arg0 + test_check_if_test_extension_present %arg0 : !transform.any_op test_remove_test_extension // expected-remark @below {{extension absent}} - test_check_if_test_extension_present %arg0 + test_check_if_test_extension_present %arg0 : !transform.any_op } } @@ -20,12 +20,12 @@ // expected-note @below {{associated payload op}} module 
{ transform.sequence failures(propagate) { - ^bb0(%arg0: !pdl.operation): + ^bb0(%arg0: !transform.any_op): test_add_test_extension "A" test_remove_test_extension test_add_test_extension "B" // expected-remark @below {{extension present, B}} - test_check_if_test_extension_present %arg0 + test_check_if_test_extension_present %arg0 : !transform.any_op } } @@ -34,56 +34,56 @@ // expected-note @below {{associated payload op}} module { transform.sequence failures(propagate) { - ^bb0(%arg0: !pdl.operation): + ^bb0(%arg0: !transform.any_op): test_add_test_extension "A" // expected-remark @below {{extension present, A}} - test_check_if_test_extension_present %arg0 + test_check_if_test_extension_present %arg0 : !transform.any_op // expected-note @below {{associated payload op}} - test_remap_operand_to_self %arg0 + test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op // expected-remark @below {{extension present, A}} - test_check_if_test_extension_present %arg0 + test_check_if_test_extension_present %arg0 : !transform.any_op } } // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): test_add_test_extension "A" // This is okay because we are replacing the top-level module operation // (0 results) with this operation that has _more_ (1) results. - %dummy = test_remap_operand_to_self %arg0 : !pdl.operation + %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op } // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): test_add_test_extension "A" - %dummy = test_remap_operand_to_self %arg0 : !pdl.operation + %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op // This is still okay. Even though we are replacing the previous // operation with (1 result) with this operation that has less (0) results, // there is no handle to the result, hence no issue with value handle update. 
- test_remap_operand_to_self %dummy + test_remap_operand_to_self %dummy : (!transform.any_op) -> !transform.any_op } // ----- transform.sequence failures(propagate) { -^bb0(%arg0: !pdl.operation): +^bb0(%arg0: !transform.any_op): test_add_test_extension "A" // expected-error @below {{cannot replace an op with another op producing fewer results while tracking handles}} - %dummy = test_remap_operand_to_self %arg0 : !pdl.operation - %valuehandle = transform.get_result %dummy[0] : (!pdl.operation) -> !transform.any_value - test_remap_operand_to_self %dummy + %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op + %valuehandle = transform.get_result %dummy[0] : (!transform.any_op) -> !transform.any_value + test_remap_operand_to_self %dummy : (!transform.any_op) -> () } // ----- module { transform.sequence failures(suppress) { - ^bb0(%arg0: !pdl.operation): + ^bb0(%arg0: !transform.any_op): // expected-error @below {{TestTransformStateExtension missing}} - test_remap_operand_to_self %arg0 + test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op } } diff --git a/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir b/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir --- a/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir @@ -163,10 +163,10 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): %f = transform.structured.match ops{["func.func"]} in %module_op - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op transform.vector.lower_broadcast %f - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-contract-matvec-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-matvec-transforms.mlir --- 
a/mlir/test/Dialect/Vector/vector-contract-matvec-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-matvec-transforms.mlir @@ -209,8 +209,8 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.lower_contraction %module_op lowering_strategy = "outerproduct" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-contract-to-dot-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-dot-transforms.mlir --- a/mlir/test/Dialect/Vector/vector-contract-to-dot-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-to-dot-transforms.mlir @@ -296,11 +296,11 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): %f = transform.structured.match ops{["func.func"]} in %module_op - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op %f2 = transform.vector.lower_contraction %f lowering_strategy = "dot" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir --- a/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir @@ -44,14 +44,14 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): %f = transform.structured.match ops{["func.func"]} in %module_op - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op %f2 = transform.vector.lower_contraction %f lowering_strategy = "matmulintrinsics" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op %f3 = transform.vector.lower_shape_cast %f2 
- : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-transforms.mlir --- a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-transforms.mlir @@ -343,11 +343,11 @@ transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): %f = transform.structured.match ops{["func.func"]} in %module_op - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op %f2 = transform.vector.lower_contraction %f lowering_strategy = "outerproduct" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-contract-to-parallel-arith-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-parallel-arith-transforms.mlir --- a/mlir/test/Dialect/Vector/vector-contract-to-parallel-arith-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-to-parallel-arith-transforms.mlir @@ -52,11 +52,11 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): %f = transform.structured.match ops{["func.func"]} in %module_op - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op %f2 = transform.vector.lower_contraction %f lowering_strategy = "parallelarith" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir b/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir --- a/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir @@ -92,12 +92,12 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: 
!pdl.operation): +^bb1(%module_op: !transform.any_op): %f = transform.structured.match ops{["func.func"]} in %module_op - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op transform.vector.lower_masks %f - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -122,10 +122,10 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): %f = transform.structured.match ops{["func.func"]} in %module_op - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op transform.vector.lower_masked_transfers %f - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir --- a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir @@ -266,8 +266,8 @@ // CHECK: vector.transpose %[[INPUT]], [1, 0, 2] : vector<3x4x5xf32> to vector<4x3x5xf32> transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.lower_multi_reduction %module_op lowering_strategy = "innerreduction" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir b/mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir --- a/mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir @@ -189,8 +189,8 @@ // CHECK: return %{{.+}} transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.lower_multi_reduction %module_op lowering_strategy = "innerparallel" - : (!pdl.operation) -> 
!pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-outerproduct-lowering-transforms.mlir b/mlir/test/Dialect/Vector/vector-outerproduct-lowering-transforms.mlir --- a/mlir/test/Dialect/Vector/vector-outerproduct-lowering-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-outerproduct-lowering-transforms.mlir @@ -136,13 +136,13 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): %f = transform.structured.match ops{["func.func"]} in %module_op - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op %f2 = transform.vector.lower_outerproduct %f - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op %f3 = transform.vector.lower_broadcast %f2 - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-shape-cast-lowering-transforms.mlir b/mlir/test/Dialect/Vector/vector-shape-cast-lowering-transforms.mlir --- a/mlir/test/Dialect/Vector/vector-shape-cast-lowering-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-shape-cast-lowering-transforms.mlir @@ -125,10 +125,10 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): %f = transform.structured.match ops{["func.func"]} in %module_op - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op %f2 = transform.vector.lower_shape_cast %f - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir @@ -30,7 +30,7 @@ transform.sequence failures(propagate) { 
-^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.apply_rank_reducing_subview_patterns %module_op - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split-copy-transform.mlir b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split-copy-transform.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split-copy-transform.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split-copy-transform.mlir @@ -107,10 +107,10 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.split_transfer_full_partial %module_op split_transfer_strategy = "linalg-copy" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -168,10 +168,10 @@ // CHECK: } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.split_transfer_full_partial %module_op split_transfer_strategy = "linalg-copy" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -236,8 +236,8 @@ // CHECK: } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.split_transfer_full_partial %module_op split_transfer_strategy = "linalg-copy" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir @@ -102,10 +102,10 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): 
transform.vector.split_transfer_full_partial %module_op split_transfer_strategy = "vector-transfer" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -160,10 +160,10 @@ transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.split_transfer_full_partial %module_op split_transfer_strategy = "vector-transfer" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -222,10 +222,10 @@ // CHECK: } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.split_transfer_full_partial %module_op split_transfer_strategy = "vector-transfer" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -264,8 +264,8 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.split_transfer_full_partial %module_op split_transfer_strategy = "vector-transfer" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir --- a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir @@ -239,12 +239,12 @@ transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): %m2 = transform.vector.lower_transfer %module_op max_transfer_rank = 99 - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op transform.vector.apply_transfer_permutation_patterns %m2 - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -362,10 +362,10 @@ } transform.sequence failures(propagate) { 
-^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): %m2 = transform.vector.lower_transfer %module_op max_transfer_rank = 99 - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op transform.vector.apply_transfer_permutation_patterns %m2 - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir --- a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir @@ -75,10 +75,10 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.lower_transpose %module_op lowering_strategy = "eltwise" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -98,10 +98,10 @@ transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.lower_transpose %module_op lowering_strategy = "shuffle_1d" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -117,10 +117,10 @@ transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.lower_transpose %module_op lowering_strategy = "flat_transpose" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -604,10 +604,10 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.lower_transpose %module_op avx2_lowering_strategy = true - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -682,10 +682,10 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): 
transform.vector.lower_transpose %module_op lowering_strategy = "shuffle_16x16" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } // ----- @@ -761,8 +761,8 @@ } transform.sequence failures(propagate) { -^bb1(%module_op: !pdl.operation): +^bb1(%module_op: !transform.any_op): transform.vector.lower_transpose %module_op lowering_strategy = "shuffle_16x16" - : (!pdl.operation) -> !pdl.operation + : (!transform.any_op) -> !transform.any_op } diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir @@ -25,9 +25,9 @@ } transform.sequence failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["linalg.conv_1d"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1, %loop = transform.structured.tile %0 [4] : (!pdl.operation) -> (!pdl.operation, !pdl.operation) + ^bb0(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["linalg.conv_1d"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loop = transform.structured.tile %0 [4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) } func.func @main() { diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir @@ -27,9 +27,9 @@ } transform.sequence failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["linalg.conv_1d_nwc_wcf"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1, %loops:2 = transform.structured.tile %0 [2, 4] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation) + ^bb0(%arg1: 
!transform.any_op): + %0 = transform.structured.match ops{["linalg.conv_1d_nwc_wcf"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile %0 [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) } func.func @main() { diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir @@ -25,9 +25,9 @@ } transform.sequence failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["linalg.conv_2d"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1, %loops:2 = transform.structured.tile %0 [2, 2] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation) + ^bb0(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["linalg.conv_2d"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile %0 [2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) } func.func @main() { diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir @@ -27,9 +27,9 @@ } transform.sequence failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_hwcf"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1, %loops:4 = transform.structured.tile %0 [2, 3, 3, 2] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation) + ^bb0(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_hwcf"]} in 
%arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:4 = transform.structured.tile %0 [2, 3, 3, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) } func.func @main() { diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir @@ -25,9 +25,9 @@ } transform.sequence failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["linalg.conv_3d"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1, %loops:3 = transform.structured.tile %0 [2, 2, 2] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation) + ^bb0(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["linalg.conv_3d"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:3 = transform.structured.tile %0 [2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) } func.func @main() { diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir @@ -27,9 +27,9 @@ } transform.sequence failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["linalg.conv_3d_ndhwc_dhwcf"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1, %loops:3 = transform.structured.tile %0 [0, 5, 5, 5] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation) + ^bb0(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["linalg.conv_3d_ndhwc_dhwcf"]} in %arg1 : 
(!transform.any_op) -> !transform.any_op + %1, %loops:3 = transform.structured.tile %0 [0, 5, 5, 5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) } func.func @main() { diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir @@ -37,9 +37,9 @@ } transform.sequence failures(propagate) { - ^bb0(%arg1: !pdl.operation): - %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1, %loops:3 = transform.structured.tile %0 [1, 2, 3] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation) + ^bb0(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:3 = transform.structured.tile %0 [1, 2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) } func.func private @printMemrefF32(%ptr : tensor<*xf32>) diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -2340,6 +2340,29 @@ // ----- +llvm.func @par_task_(%arg0: !llvm.ptr {fir.bindc_name = "a"}) { + %0 = llvm.mlir.constant(1 : i32) : i32 + omp.task { + omp.parallel { + llvm.store %0, %arg0 : !llvm.ptr + omp.terminator + } + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: @par_task_ +// CHECK: %[[TASK_ALLOC:.*]] = call ptr @__kmpc_omp_task_alloc({{.*}}ptr @par_task_..omp_par.wrapper) +// CHECK: call i32 @__kmpc_omp_task({{.*}}, ptr %[[TASK_ALLOC]]) +// CHECK-LABEL: define internal void @par_task_..omp_par +// CHECK: %[[ARG_ALLOC:.*]] = alloca { ptr }, align 8 +// CHECK: call void 
({{.*}}) @__kmpc_fork_call({{.*}}, ptr @par_task_..omp_par..omp_par, ptr %[[ARG_ALLOC]]) +// CHECK: define internal void @par_task_..omp_par..omp_par +// CHECK: define i32 @par_task_..omp_par.wrapper +// CHECK: call void @par_task_..omp_par +// ----- + llvm.func @foo() -> () llvm.func @omp_taskgroup(%x: i32, %y: i32, %zaddr: !llvm.ptr) { diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td --- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td +++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td @@ -44,9 +44,10 @@ : Op, DeclareOpInterfaceMethods]> { - let arguments = (ins Optional:$operand); - let results = (outs PDL_Operation:$res); - let assemblyFormat = "($operand^)? attr-dict"; + let arguments = (ins Optional:$operand); + let results = (outs TransformHandleTypeInterface:$res); + let assemblyFormat = + "($operand^)? attr-dict `:` functional-type($operand, $res)"; let cppNamespace = "::mlir::test"; } @@ -100,8 +101,10 @@ DeclareOpInterfaceMethods]> { let arguments = (ins Transform_AnyHandleOrParamType:$operand, - Optional:$second_operand); - let assemblyFormat = "$operand (`,` $second_operand^)? attr-dict `:` type($operand)"; + Optional:$second_operand); + let assemblyFormat = + "$operand (`,` $second_operand^)? 
attr-dict `:` type($operand)" + "(`,` type($second_operand)^)?"; let cppNamespace = "::mlir::test"; } @@ -110,9 +113,10 @@ [DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let arguments = (ins - PDL_Operation:$operand, + TransformHandleTypeInterface:$operand, StrAttr:$op_kind); - let assemblyFormat = "$operand `,` $op_kind attr-dict"; + let assemblyFormat = + "$operand `,` $op_kind attr-dict `:` type($operand)"; let cppNamespace = "::mlir::test"; } @@ -166,8 +170,8 @@ : Op, DeclareOpInterfaceMethods]> { - let arguments = (ins PDL_Operation:$operand); - let assemblyFormat = "$operand attr-dict"; + let arguments = (ins TransformHandleTypeInterface:$operand); + let assemblyFormat = "$operand attr-dict `:` type($operand)"; let cppNamespace = "::mlir::test"; } @@ -175,9 +179,9 @@ : Op, DeclareOpInterfaceMethods]> { - let arguments = (ins PDL_Operation:$operand); + let arguments = (ins TransformHandleTypeInterface:$operand); let results = (outs Optional:$out); - let assemblyFormat = "$operand attr-dict (`:` type($out)^)?"; + let assemblyFormat = "$operand attr-dict `:` functional-type($operand, $out)"; let cppNamespace = "::mlir::test"; } @@ -221,9 +225,9 @@ [DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, FunctionalStyleTransformOpTrait]> { - let arguments = (ins PDL_Operation:$target, StrAttr:$remark, + let arguments = (ins TransformHandleTypeInterface:$target, StrAttr:$remark, UnitAttr:$fail_after_erase); - let assemblyFormat = "$target `,` $remark attr-dict"; + let assemblyFormat = "$target `,` $remark attr-dict `:` type($target)"; let cppNamespace = "::mlir::test"; } @@ -231,11 +235,12 @@ : Op { - let arguments = (ins PDL_Operation:$target); - let results = (outs PDL_Operation:$a, - PDL_Operation:$b, - PDL_Operation:$c); - let assemblyFormat = "$target attr-dict"; + let arguments = (ins TransformHandleTypeInterface:$target); + let results = (outs TransformHandleTypeInterface:$a, + TransformHandleTypeInterface:$b, + TransformHandleTypeInterface:$c); + 
let assemblyFormat = + "$target attr-dict `:` functional-type(operands, results)"; let cppNamespace = "::mlir::test"; let extraClassDeclaration = [{ ::mlir::DiagnosedSilenceableFailure applyToOne( @@ -249,9 +254,10 @@ : Op { - let arguments = (ins PDL_Operation:$target); - let results = (outs PDL_Operation:$result); - let assemblyFormat = "$target attr-dict"; + let arguments = (ins TransformHandleTypeInterface:$target); + let results = (outs TransformHandleTypeInterface:$result); + let assemblyFormat = + "$target attr-dict `:` functional-type($target, $result)"; let cppNamespace = "::mlir::test"; let extraClassDeclaration = [{ ::mlir::DiagnosedSilenceableFailure applyToOne( @@ -265,10 +271,11 @@ : Op { - let arguments = (ins PDL_Operation:$target); - let results = (outs PDL_Operation:$result1, - PDL_Operation:$result2); - let assemblyFormat = "$target attr-dict"; + let arguments = (ins TransformHandleTypeInterface:$target); + let results = (outs TransformHandleTypeInterface:$result1, + TransformHandleTypeInterface:$result2); + let assemblyFormat = + "$target attr-dict `:` functional-type(operands, results)"; let cppNamespace = "::mlir::test"; let extraClassDeclaration = [{ ::mlir::DiagnosedSilenceableFailure applyToOne( @@ -282,10 +289,11 @@ : Op { - let arguments = (ins PDL_Operation:$target); - let results = (outs PDL_Operation:$null, - PDL_Operation:$non_null); - let assemblyFormat = "$target attr-dict"; + let arguments = (ins TransformHandleTypeInterface:$target); + let results = (outs TransformHandleTypeInterface:$null, + TransformHandleTypeInterface:$non_null); + let assemblyFormat = + "$target attr-dict `:` functional-type(operands, results)"; let cppNamespace = "::mlir::test"; let extraClassDeclaration = [{ ::mlir::DiagnosedSilenceableFailure applyToOne( @@ -296,12 +304,12 @@ } def TestMixedSuccessAndSilenceableOp - : Op { - let arguments = (ins PDL_Operation:$target); + let arguments = (ins TransformHandleTypeInterface:$target); let results = (outs); - let 
assemblyFormat = "$target attr-dict"; + let assemblyFormat = "$target attr-dict `:` type($target)"; let cppNamespace = "::mlir::test"; let extraClassDeclaration = [{ ::mlir::DiagnosedSilenceableFailure applyToOne( @@ -324,18 +332,19 @@ : Op, DeclareOpInterfaceMethods]> { - let arguments = (ins PDL_Operation:$handle); - let results = (outs PDL_Operation:$copy); + let arguments = (ins TransformHandleTypeInterface:$handle); + let results = (outs TransformHandleTypeInterface:$copy); let cppNamespace = "::mlir::test"; - let assemblyFormat = "$handle attr-dict"; + let assemblyFormat = + "$handle attr-dict `:` functional-type(operands, results)"; } def TestReportNumberOfTrackedHandlesNestedUnder : Op, DeclareOpInterfaceMethods]> { - let arguments = (ins PDL_Operation:$target); - let assemblyFormat = "$target attr-dict"; + let arguments = (ins TransformHandleTypeInterface:$target); + let assemblyFormat = "$target attr-dict `:` type($target)"; let cppNamespace = "::mlir::test"; } diff --git a/mlir/test/lib/IR/CMakeLists.txt b/mlir/test/lib/IR/CMakeLists.txt --- a/mlir/test/lib/IR/CMakeLists.txt +++ b/mlir/test/lib/IR/CMakeLists.txt @@ -18,6 +18,7 @@ TestSymbolUses.cpp TestRegions.cpp TestTypes.cpp + TestUseListOrders.cpp TestVisitors.cpp TestVisitorsGeneric.cpp diff --git a/mlir/test/lib/IR/TestUseListOrders.cpp b/mlir/test/lib/IR/TestUseListOrders.cpp new file mode 100644 --- /dev/null +++ b/mlir/test/lib/IR/TestUseListOrders.cpp @@ -0,0 +1,227 @@ +//===- TestUseListOrders.cpp - Passes to test use-list orders -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Bytecode/BytecodeWriter.h" +#include "mlir/Bytecode/Encoding.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/OwningOpRef.h" +#include "mlir/Parser/Parser.h" +#include "mlir/Pass/Pass.h" + +#include +#include + +using namespace mlir; + +namespace { +/// This pass tests that: +/// 1) we can shuffle use-lists correctly; +/// 2) use-list orders are preserved after a roundtrip to bytecode. +class TestPreserveUseListOrders + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestPreserveUseListOrders) + + TestPreserveUseListOrders() = default; + TestPreserveUseListOrders(const TestPreserveUseListOrders &pass) + : PassWrapper(pass) {} + StringRef getArgument() const final { return "test-verify-uselistorder"; } + StringRef getDescription() const final { + return "Verify that roundtripping the IR to bytecode preserves the order " + "of the uselists"; + } + Option rngSeed{*this, "rng-seed", + llvm::cl::desc("Specify an input random seed"), + llvm::cl::init(1)}; + + LogicalResult initialize(MLIRContext *context) override { + rng.seed(static_cast(rngSeed)); + return success(); + } + + void runOnOperation() override { + // Clone the module so that we can plug in this pass to any other + // independently. + OwningOpRef cloneModule = getOperation().clone(); + + // 1. Compute the op numbering of the module. + computeOpNumbering(*cloneModule); + + // 2. Loop over all the values and shuffle the uses. While doing so, check + // that each shuffle is correct. + if (failed(shuffleUses(*cloneModule))) + return signalPassFailure(); + + // 3. Do a bytecode roundtrip to version 3, which supports use-list order + // preservation. 
+ auto roundtripModuleOr = doRoundtripToBytecode(*cloneModule, 3); + // If the bytecode roundtrip failed, try to roundtrip the original module + // to version 2, which does not support use-list. If this also fails, the + // original module had an issue unrelated to uselists. + if (failed(roundtripModuleOr)) { + auto testModuleOr = doRoundtripToBytecode(getOperation(), 2); + if (failed(testModuleOr)) + return; + + return signalPassFailure(); + } + + // 4. Recompute the op numbering on the new module. The numbering should be + // the same as (1), but on the new operation pointers. + computeOpNumbering(roundtripModuleOr->get()); + + // 5. Loop over all the values and verify that the use-list is consistent + // with the post-shuffle order of step (2). + if (failed(verifyUseListOrders(roundtripModuleOr->get()))) + return signalPassFailure(); + } + +private: + FailureOr> doRoundtripToBytecode(Operation *module, + uint32_t version) { + std::string str; + llvm::raw_string_ostream m(str); + BytecodeWriterConfig config; + config.setDesiredBytecodeVersion(version); + if (failed(writeBytecodeToFile(module, m, config))) + return failure(); + + ParserConfig parseConfig(&getContext(), /*verifyAfterParse=*/true); + auto newModuleOp = parseSourceString(StringRef(str), parseConfig); + if (!newModuleOp.get()) + return failure(); + return newModuleOp; + } + + /// Compute an ordered numbering for all the operations in the IR. + void computeOpNumbering(Operation *topLevelOp) { + uint32_t operationID = 0; + opNumbering.clear(); + topLevelOp->walk( + [&](Operation *op) { opNumbering.try_emplace(op, operationID++); }); + } + + template + SmallVector getUseIDs(ValueT val) { + return SmallVector(llvm::map_range(val.getUses(), [&](auto &use) { + return bytecode::getUseID(use, opNumbering.at(use.getOwner())); + })); + } + + LogicalResult shuffleUses(Operation *topLevelOp) { + uint32_t valueID = 0; + /// Permute randomly the use-list of each value. 
It is guaranteed that at + /// least one pair of the use list is permuted. + auto doShuffleForRange = [&](ValueRange range) -> LogicalResult { + for (auto val : range) { + if (val.use_empty() || val.hasOneUse()) + continue; + + /// Get a valid index permutation for the uses of value. + SmallVector permutation = getRandomPermutation(val); + + /// Store original order and verify that the shuffle was applied + /// correctly. + auto useIDs = getUseIDs(val); + + /// Apply shuffle to the uselist. + val.shuffleUseList(permutation); + + /// Get the new order and verify the shuffle happened correctly. + auto permutedIDs = getUseIDs(val); + if (permutedIDs.size() != useIDs.size()) + return failure(); + for (size_t idx = 0; idx < permutation.size(); idx++) + if (useIDs[idx] != permutedIDs[permutation[idx]]) + return failure(); + + referenceUseListOrder.try_emplace( + valueID++, llvm::map_range(val.getUses(), [&](auto &use) { + return bytecode::getUseID(use, opNumbering.at(use.getOwner())); + })); + } + return success(); + }; + + return walkOverValues(topLevelOp, doShuffleForRange); + } + + LogicalResult verifyUseListOrders(Operation *topLevelOp) { + uint32_t valueID = 0; + /// Check that the use-list for the value range matches the one stored in + /// the reference. 
+ auto doValidationForRange = [&](ValueRange range) -> LogicalResult { + for (auto val : range) { + if (val.use_empty() || val.hasOneUse()) + continue; + auto referenceOrder = referenceUseListOrder.at(valueID++); + for (auto [use, referenceID] : + llvm::zip(val.getUses(), referenceOrder)) { + uint64_t uniqueID = + bytecode::getUseID(use, opNumbering.at(use.getOwner())); + if (uniqueID != referenceID) { + use.getOwner()->emitError() + << "found use-list order mismatch for value: " << val; + return failure(); + } + } + } + return success(); + }; + + return walkOverValues(topLevelOp, doValidationForRange); + } + + /// Walk over blocks and operations and execute a callable over the ranges of + /// operands/results respectively. + template + LogicalResult walkOverValues(Operation *topLevelOp, FuncT callable) { + auto blockWalk = topLevelOp->walk([&](Block *block) { + if (failed(callable(block->getArguments()))) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + + if (blockWalk.wasInterrupted()) + return failure(); + + auto resultsWalk = topLevelOp->walk([&](Operation *op) { + if (failed(callable(op->getResults()))) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + + return failure(resultsWalk.wasInterrupted()); + } + + /// Creates a random permutation of the uselist order chain of the provided + /// value. + SmallVector getRandomPermutation(Value value) { + size_t numUses = std::distance(value.use_begin(), value.use_end()); + SmallVector permutation(numUses); + unsigned zero = 0; + std::iota(permutation.begin(), permutation.end(), zero); + std::shuffle(permutation.begin(), permutation.end(), rng); + return permutation; + } + + /// Map each value to its use-list order encoded with unique use IDs. + DenseMap> referenceUseListOrder; + + /// Map each operation to its global ID. 
+ DenseMap opNumbering; + + std::default_random_engine rng; +}; +} // namespace + +namespace mlir { +void registerTestPreserveUseListOrders() { + PassRegistration(); +} +} // namespace mlir diff --git a/mlir/test/python/dialects/python_test.py b/mlir/test/python/dialects/python_test.py --- a/mlir/test/python/dialects/python_test.py +++ b/mlir/test/python/dialects/python_test.py @@ -131,6 +131,27 @@ del op.unit print(f"Unit: {op.unit}") +# CHECK-LABEL: TEST: attrBuilder +@run +def attrBuilder(): + with Context() as ctx, Location.unknown(): + ctx.allow_unregistered_dialects = True + op = test.AttributesOp(x_bool=True, + x_i16=1, + x_i32=2, + x_i64=3, + x_si16=-1, + x_si32=-2, + x_f32=1.5, + x_f64=2.5, + x_str='x_str', + x_i32_array=[1, 2, 3], + x_i64_array=[4, 5, 6], + x_f32_array=[1.5, -2.5, 3.5], + x_f64_array=[4.5, 5.5, -6.5], + x_i64_dense=[1, 2, 3, 4, 5, 6]) + print(op) + # CHECK-LABEL: TEST: inferReturnTypes @run diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py --- a/mlir/test/python/dialects/transform.py +++ b/mlir/test/python/dialects/transform.py @@ -32,38 +32,38 @@ @run def testSequenceOp(): sequence = transform.SequenceOp(transform.FailurePropagationMode.PROPAGATE, - [pdl.OperationType.get()], - pdl.OperationType.get()) + [transform.AnyOpType.get()], + transform.AnyOpType.get()) with InsertionPoint(sequence.body): transform.YieldOp([sequence.bodyTarget]) # CHECK-LABEL: TEST: testSequenceOp - # CHECK: = transform.sequence -> !pdl.operation failures(propagate) { - # CHECK: ^{{.*}}(%[[ARG0:.+]]: !pdl.operation): - # CHECK: yield %[[ARG0]] : !pdl.operation + # CHECK: = transform.sequence -> !transform.any_op failures(propagate) { + # CHECK: ^{{.*}}(%[[ARG0:.+]]: !transform.any_op): + # CHECK: yield %[[ARG0]] : !transform.any_op # CHECK: } @run def testNestedSequenceOp(): - sequence = transform.SequenceOp(transform.FailurePropagationMode.PROPAGATE, [], pdl.OperationType.get()) + sequence = 
transform.SequenceOp(transform.FailurePropagationMode.PROPAGATE, [], transform.AnyOpType.get()) with InsertionPoint(sequence.body): nested = transform.SequenceOp(transform.FailurePropagationMode.PROPAGATE, [], sequence.bodyTarget) with InsertionPoint(nested.body): doubly_nested = transform.SequenceOp( transform.FailurePropagationMode.PROPAGATE, - [pdl.OperationType.get()], nested.bodyTarget) + [transform.AnyOpType.get()], nested.bodyTarget) with InsertionPoint(doubly_nested.body): transform.YieldOp([doubly_nested.bodyTarget]) transform.YieldOp() transform.YieldOp() # CHECK-LABEL: TEST: testNestedSequenceOp # CHECK: transform.sequence failures(propagate) { - # CHECK: ^{{.*}}(%[[ARG0:.+]]: !pdl.operation): - # CHECK: sequence %[[ARG0]] : !pdl.operation failures(propagate) { - # CHECK: ^{{.*}}(%[[ARG1:.+]]: !pdl.operation): - # CHECK: = sequence %[[ARG1]] : !pdl.operation -> !pdl.operation failures(propagate) { - # CHECK: ^{{.*}}(%[[ARG2:.+]]: !pdl.operation): - # CHECK: yield %[[ARG2]] : !pdl.operation + # CHECK: ^{{.*}}(%[[ARG0:.+]]: !transform.any_op): + # CHECK: sequence %[[ARG0]] : !transform.any_op failures(propagate) { + # CHECK: ^{{.*}}(%[[ARG1:.+]]: !transform.any_op): + # CHECK: = sequence %[[ARG1]] : !transform.any_op -> !transform.any_op failures(propagate) { + # CHECK: ^{{.*}}(%[[ARG2:.+]]: !transform.any_op): + # CHECK: yield %[[ARG2]] : !transform.any_op # CHECK: } # CHECK: } # CHECK: } @@ -103,58 +103,58 @@ @run def testTransformPDLOps(): - withPdl = transform.WithPDLPatternsOp(pdl.OperationType.get()) + withPdl = transform.WithPDLPatternsOp(transform.AnyOpType.get()) with InsertionPoint(withPdl.body): sequence = transform.SequenceOp(transform.FailurePropagationMode.PROPAGATE, - [pdl.OperationType.get()], + [transform.AnyOpType.get()], withPdl.bodyTarget) with InsertionPoint(sequence.body): - match = transform.PDLMatchOp(pdl.OperationType.get(), sequence.bodyTarget, "pdl_matcher") + match = transform.PDLMatchOp(transform.AnyOpType.get(), 
sequence.bodyTarget, "pdl_matcher") transform.YieldOp(match) # CHECK-LABEL: TEST: testTransformPDLOps # CHECK: transform.with_pdl_patterns { - # CHECK: ^{{.*}}(%[[ARG0:.+]]: !pdl.operation): - # CHECK: = sequence %[[ARG0]] : !pdl.operation -> !pdl.operation failures(propagate) { - # CHECK: ^{{.*}}(%[[ARG1:.+]]: !pdl.operation): + # CHECK: ^{{.*}}(%[[ARG0:.+]]: !transform.any_op): + # CHECK: = sequence %[[ARG0]] : !transform.any_op -> !transform.any_op failures(propagate) { + # CHECK: ^{{.*}}(%[[ARG1:.+]]: !transform.any_op): # CHECK: %[[RES:.+]] = pdl_match @pdl_matcher in %[[ARG1]] - # CHECK: yield %[[RES]] : !pdl.operation + # CHECK: yield %[[RES]] : !transform.any_op # CHECK: } # CHECK: } @run def testGetClosestIsolatedParentOp(): - sequence = transform.SequenceOp(transform.FailurePropagationMode.PROPAGATE, [], pdl.OperationType.get()) + sequence = transform.SequenceOp(transform.FailurePropagationMode.PROPAGATE, [], transform.AnyOpType.get()) with InsertionPoint(sequence.body): - transform.GetClosestIsolatedParentOp(pdl.OperationType.get(), sequence.bodyTarget) + transform.GetClosestIsolatedParentOp(transform.AnyOpType.get(), sequence.bodyTarget) transform.YieldOp() # CHECK-LABEL: TEST: testGetClosestIsolatedParentOp # CHECK: transform.sequence - # CHECK: ^{{.*}}(%[[ARG1:.+]]: !pdl.operation): + # CHECK: ^{{.*}}(%[[ARG1:.+]]: !transform.any_op): # CHECK: = get_closest_isolated_parent %[[ARG1]] @run def testMergeHandlesOp(): - sequence = transform.SequenceOp(transform.FailurePropagationMode.PROPAGATE, [], pdl.OperationType.get()) + sequence = transform.SequenceOp(transform.FailurePropagationMode.PROPAGATE, [], transform.AnyOpType.get()) with InsertionPoint(sequence.body): transform.MergeHandlesOp([sequence.bodyTarget]) transform.YieldOp() # CHECK-LABEL: TEST: testMergeHandlesOp # CHECK: transform.sequence - # CHECK: ^{{.*}}(%[[ARG1:.+]]: !pdl.operation): + # CHECK: ^{{.*}}(%[[ARG1:.+]]: !transform.any_op): # CHECK: = merge_handles %[[ARG1]] @run def 
testReplicateOp(): - with_pdl = transform.WithPDLPatternsOp(pdl.OperationType.get()) + with_pdl = transform.WithPDLPatternsOp(transform.AnyOpType.get()) with InsertionPoint(with_pdl.body): sequence = transform.SequenceOp( transform.FailurePropagationMode.PROPAGATE, [], with_pdl.bodyTarget) with InsertionPoint(sequence.body): - m1 = transform.PDLMatchOp(pdl.OperationType.get(), sequence.bodyTarget, "first") - m2 = transform.PDLMatchOp(pdl.OperationType.get(), sequence.bodyTarget, "second") + m1 = transform.PDLMatchOp(transform.AnyOpType.get(), sequence.bodyTarget, "first") + m2 = transform.PDLMatchOp(transform.AnyOpType.get(), sequence.bodyTarget, "second") transform.ReplicateOp(m1, [m2]) transform.YieldOp() # CHECK-LABEL: TEST: testReplicateOp diff --git a/mlir/test/python/python_test_ops.td b/mlir/test/python/python_test_ops.td --- a/mlir/test/python/python_test_ops.td +++ b/mlir/test/python/python_test_ops.td @@ -57,6 +57,23 @@ UnitAttr:$unit); } +def AttributesOp : TestOp<"attributes_op"> { + let arguments = (ins BoolAttr:$x_bool, + I16Attr: $x_i16, + I32Attr: $x_i32, + I64Attr: $x_i64, + SI16Attr: $x_si16, + SI32Attr: $x_si32, + F32Attr: $x_f32, + F64Attr: $x_f64, + StrAttr: $x_str, + I32ArrayAttr: $x_i32_array, + I64ArrayAttr: $x_i64_array, + F32ArrayAttr: $x_f32_array, + F64ArrayAttr: $x_f64_array, + DenseI64ArrayAttr: $x_i64_dense); +} + def PropertyOp : TestOp<"property_op"> { let arguments = (ins I32Attr:$property, I32:$idx); diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -53,6 +53,7 @@ void registerTestPrintDefUsePass(); void registerTestPrintInvalidPass(); void registerTestPrintNestingPass(); +void registerTestPreserveUseListOrders(); void registerTestReducer(); void registerTestSpirvEntryPointABIPass(); void registerTestSpirvModuleCombinerPass(); @@ -167,6 +168,7 @@ registerTestPrintDefUsePass(); registerTestPrintInvalidPass(); 
registerTestPrintNestingPass(); + registerTestPreserveUseListOrders(); registerTestReducer(); registerTestSpirvEntryPointABIPass(); registerTestSpirvModuleCombinerPass(); diff --git a/utils/bazel/WORKSPACE b/utils/bazel/WORKSPACE --- a/utils/bazel/WORKSPACE +++ b/utils/bazel/WORKSPACE @@ -27,13 +27,6 @@ llvm_configure(name = "llvm-project") -load("@llvm-raw//utils/bazel:terminfo.bzl", "llvm_terminfo_from_env") - -maybe( - llvm_terminfo_from_env, - name = "llvm_terminfo", -) - maybe( http_archive, name = "llvm_zlib", diff --git a/utils/bazel/configure.bzl b/utils/bazel/configure.bzl --- a/utils/bazel/configure.bzl +++ b/utils/bazel/configure.bzl @@ -5,7 +5,6 @@ """Helper macros to configure the LLVM overlay project.""" load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe") -load(":terminfo.bzl", "llvm_terminfo_disable", "llvm_terminfo_system") # Directory of overlay files relative to WORKSPACE DEFAULT_OVERLAY_PATH = "llvm-project-overlay" @@ -173,15 +172,3 @@ "targets": attr.string_list(default = DEFAULT_TARGETS), }, ) - -def llvm_disable_optional_support_deps(): - maybe( - llvm_terminfo_disable, - name = "llvm_terminfo", - ) - -def llvm_use_system_support_deps(): - maybe( - llvm_terminfo_system, - name = "llvm_terminfo", - ) diff --git a/utils/bazel/deps_impl/BUILD.bazel b/utils/bazel/deps_impl/BUILD.bazel deleted file mode 100644 --- a/utils/bazel/deps_impl/BUILD.bazel +++ /dev/null @@ -1,5 +0,0 @@ -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -# Required to reference files in this package diff --git a/utils/bazel/deps_impl/terminfo_disable.BUILD b/utils/bazel/deps_impl/terminfo_disable.BUILD deleted file mode 100644 --- a/utils/bazel/deps_impl/terminfo_disable.BUILD +++ /dev/null @@ -1,10 +0,0 @@ -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
-# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -# Empty stub library. This doesn't include any terminfo library and doesn't set -# the LLVM `#define`s to enable usage of terminfo. -cc_library( - name = "terminfo", - visibility = ["//visibility:public"], -) diff --git a/utils/bazel/deps_impl/terminfo_system.BUILD b/utils/bazel/deps_impl/terminfo_system.BUILD deleted file mode 100644 --- a/utils/bazel/deps_impl/terminfo_system.BUILD +++ /dev/null @@ -1,15 +0,0 @@ -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -# Wrapper library for some system terminfo. Using this only works if the -# toolchain already has the relevant library search paths configured. It also -# sets the relevant LLVM `#define`s to enoble using terminfo. -cc_library( - name = "terminfo", - defines = ["LLVM_ENABLE_TERMINFO=1"], - # Note that we will replace these link options with ones needed to - # effectively link against a terminfo providing library on the system. - linkopts = {TERMINFO_LINKOPTS}, - visibility = ["//visibility:public"], -) diff --git a/utils/bazel/deps_impl/terminfo_test.c b/utils/bazel/deps_impl/terminfo_test.c deleted file mode 100644 --- a/utils/bazel/deps_impl/terminfo_test.c +++ /dev/null @@ -1,17 +0,0 @@ -/* -This file is licensed under the Apache License v2.0 with LLVM Exceptions. -See https://llvm.org/LICENSE.txt for license information. 
-SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -*/ - -extern int setupterm(char *term, int filedes, int *errret); -extern struct term *set_curterm(struct term *termp); -extern int del_curterm(struct term *termp); -extern int tigetnum(char *capname); - -int main() { - setupterm(0, 0, 0); - set_curterm(0); - del_curterm(0); - tigetnum(0); -} diff --git a/utils/bazel/examples/http_archive/WORKSPACE b/utils/bazel/examples/http_archive/WORKSPACE --- a/utils/bazel/examples/http_archive/WORKSPACE +++ b/utils/bazel/examples/http_archive/WORKSPACE @@ -38,8 +38,3 @@ load("@llvm-raw//utils/bazel:configure.bzl", "llvm_configure", "llvm_disable_optional_support_deps") llvm_configure(name = "llvm-project") - -# Disables optional dependencies for Support like zlib and terminfo. You may -# instead want to configure them using the macros in the corresponding bzl -# files. -llvm_disable_optional_support_deps() diff --git a/utils/bazel/examples/submodule/WORKSPACE b/utils/bazel/examples/submodule/WORKSPACE --- a/utils/bazel/examples/submodule/WORKSPACE +++ b/utils/bazel/examples/submodule/WORKSPACE @@ -27,8 +27,3 @@ load("@llvm-raw//utils/bazel:configure.bzl", "llvm_configure", "llvm_disable_optional_support_deps") llvm_configure(name = "llvm-project") - -# Disables optional dependencies for Support like zlib and terminfo. You may -# instead want to configure them using the macros in the corresponding bzl -# files. -llvm_disable_optional_support_deps() diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -274,11 +274,6 @@ deps = [ ":config", ":Demangle", - # We unconditionally depend on the custom LLVM terminfo wrapper. This - # will be an empty library unless terminfo is enabled, in which case it - # will both provide the necessary dependencies and configuration - # defines. 
- "@llvm_terminfo//:terminfo", # We unconditionally depend on the custom LLVM zlib wrapper. This will # be an empty library unless zlib is enabled, in which case it will # both provide the necessary dependencies and configuration defines. diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6164,6 +6164,14 @@ ["-gen-op-interface-defs"], "include/mlir/Interfaces/MemorySlotOpInterfaces.cpp.inc", ), + ( + ["-gen-type-interface-decls"], + "include/mlir/Interfaces/MemorySlotTypeInterfaces.h.inc", + ), + ( + ["-gen-type-interface-defs"], + "include/mlir/Interfaces/MemorySlotTypeInterfaces.cpp.inc", + ), ], tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/MemorySlotInterfaces.td", @@ -8274,6 +8282,7 @@ ], deps = [ ":ControlFlowInterfaces", + ":FuncDialect", ":IR", ":LLVMDialect", ":OpenMPInterfacesIncGen", @@ -10524,11 +10533,11 @@ ], includes = ["include"], deps = [ - ":MemorySlotInterfacesTdFiles", ":ArithOpsTdFiles", ":CastInterfacesTdFiles", ":ControlFlowInterfacesTdFiles", ":CopyOpInterfaceTdFiles", + ":MemorySlotInterfacesTdFiles", ":OpBaseTdFiles", ":ShapedOpInterfacesTdFiles", ":SideEffectInterfacesTdFiles", @@ -10604,9 +10613,9 @@ ":DialectUtils", ":IR", ":InferTypeOpInterface", - ":MemorySlotInterfaces", ":MemRefBaseIncGen", ":MemRefOpsIncGen", + ":MemorySlotInterfaces", ":ShapedOpInterfaces", ":ValueBoundsOpInterface", ":ViewLikeInterface", diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -386,9 +386,12 @@ "//llvm:Support", "//mlir:Analysis", "//mlir:ArithDialect", + "//mlir:BytecodeReader", + "//mlir:BytecodeWriter", "//mlir:FuncDialect", "//mlir:IR", "//mlir:LinalgDialect", + 
"//mlir:Parser", "//mlir:Pass", "//mlir:Support", ], diff --git a/utils/bazel/terminfo.bzl b/utils/bazel/terminfo.bzl deleted file mode 100644 --- a/utils/bazel/terminfo.bzl +++ /dev/null @@ -1,203 +0,0 @@ -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -"""Repository rules to configure the terminfo used by LLVM. - -Most users should pick one of the explicit rules to configure their use of terminfo -with LLVM: -- `llvm_terminfo_system` will detect and link against a terminfo-implementing - system library (non-hermetically). -- 'llvm_terminfo_disable` will disable terminfo completely. - -If you would like to make your build configurable, you can use -`llvm_terminfo_from_env`. By default, this will disable terminfo, but will -inspect the environment variable (most easily set with a `--repo_env` flag to -the Bazel invocation) `BAZEL_LLVM_TERMINFO_STRATEGY`. If it is set to -`system` then it will behave the same as `llvm_terminfo_system`. Any other -setting will disable terminfo the same as not setting it at all. -""" - -def _llvm_terminfo_disable_impl(repository_ctx): - repository_ctx.template( - "BUILD", - repository_ctx.attr._disable_build_template, - executable = False, - ) - -_terminfo_disable_attrs = { - "_disable_build_template": attr.label( - default = "@llvm-raw//utils/bazel/deps_impl:terminfo_disable.BUILD", - allow_single_file = True, - ), -} - -llvm_terminfo_disable = repository_rule( - implementation = _llvm_terminfo_disable_impl, - attrs = _terminfo_disable_attrs, -) - -def _find_c_compiler(repository_ctx): - """Returns the path to a plausible C compiler. - - This routine will only reliably work on roughly POSIX-y systems as it - ultimately falls back on the `cc` binary. 
Fortunately, the thing we are - trying to use it for (detecting if a trivial source file can compile and - link against a particular library) requires very little. - """ - cc_env = repository_ctx.os.environ.get("CC") - cc = None - if cc_env: - if "/" in cc_env: - return repository_ctx.path(cc_env) - else: - return repository_ctx.which(cc_env) - - # Look for Clang, GCC, and the POSIX / UNIX specified C compiler - # binaries. - for compiler in ["clang", "gcc", "c99", "c89", "cc"]: - cc = repository_ctx.which(compiler) - if cc: - return cc - - return None - -def _try_link(repository_ctx, cc, source, linker_flags): - """Returns `True` if able to link the source with the linker flag. - - Given a source file that contains references to library routines, this - will check that when linked with the provided linker flag, those - references are successfully resolved. This routine assumes a generally - POSIX-y and GCC-ish compiler and environment and shouldn't be expected to - work outside of that. - """ - cmd = [ - cc, - # Force discard the linked executable. - "-o", - "/dev/null", - # Leave language detection to the compiler. - source, - ] - - # The linker flag must be valid for a compiler invocation of the link step, - # so just append them to the command. - cmd += linker_flags - exec_result = repository_ctx.execute(cmd, timeout = 20) - return exec_result.return_code == 0 - -def _llvm_terminfo_system_impl(repository_ctx): - # LLVM doesn't need terminfo support on Windows, so just disable it. - if repository_ctx.os.name.lower().find("windows") != -1: - _llvm_terminfo_disable_impl(repository_ctx) - return - - if len(repository_ctx.attr.system_linkopts) > 0: - linkopts = repository_ctx.attr.system_linkopts - else: - required = repository_ctx.attr.system_required - - # Find a C compiler we can use to detect viable linkopts on this system. 
- cc = _find_c_compiler(repository_ctx) - if not cc: - if required: - fail("Failed to find a C compiler executable") - else: - _llvm_terminfo_disable_impl(repository_ctx) - return - - # Get the source file we use to detect successful linking of terminfo. - source = repository_ctx.path(repository_ctx.attr._terminfo_test_source) - - # Collect the candidate linkopts and wrap them into a list. Ideally, - # these would be provided as lists, but Bazel doesn't currently - # support that. See: https://github.com/bazelbuild/bazel/issues/12178 - linkopts_candidates = [[x] for x in repository_ctx.attr.candidate_system_linkopts] - linkopts = None - - # For each candidate, try to use it to link our test source file. - for linkopts_candidate in linkopts_candidates: - if _try_link(repository_ctx, cc, source, linkopts_candidate): - linkopts = linkopts_candidate - break - - # If we never found a viable linkopts candidate, either error or disable - # terminfo for LLVM. - if not linkopts: - if required: - fail("Failed to detect which linkopt would successfully provide the " + - "necessary terminfo functionality") - else: - _llvm_terminfo_disable_impl(repository_ctx) - return - - repository_ctx.template( - "BUILD", - repository_ctx.attr._system_build_template, - substitutions = { - "{TERMINFO_LINKOPTS}": str(linkopts), - }, - executable = False, - ) - -def _merge_attrs(attrs_list): - attrs = {} - for input_attrs in attrs_list: - attrs.update(input_attrs) - return attrs - -_terminfo_system_attrs = _merge_attrs([_terminfo_disable_attrs, { - "_system_build_template": attr.label( - default = "@llvm-raw//utils/bazel/deps_impl:terminfo_system.BUILD", - allow_single_file = True, - ), - "_terminfo_test_source": attr.label( - default = "@llvm-raw//utils/bazel/deps_impl:terminfo_test.c", - allow_single_file = True, - ), - "candidate_system_linkopts": attr.string_list( - default = [ - "-lterminfo", - "-ltinfo", - "-lcurses", - "-lncurses", - "-lncursesw", - ], - doc = "Candidate linkopts to test 
and see if they can link " + - "successfully.", - ), - "system_required": attr.bool( - default = False, - doc = "Require that one of the candidates is detected successfully on POSIX platforms where it is needed.", - ), - "system_linkopts": attr.string_list( - default = [], - doc = "If non-empty, a specific array of linkopts to use to " + - "successfully link against the terminfo library. No " + - "detection is performed if this option is provided, it " + - "directly forces the use of these link options. No test is " + - "run to determine if they are valid or work correctly either.", - ), -}]) - -llvm_terminfo_system = repository_rule( - implementation = _llvm_terminfo_system_impl, - configure = True, - local = True, - attrs = _terminfo_system_attrs, -) - -def _llvm_terminfo_from_env_impl(repository_ctx): - terminfo_strategy = repository_ctx.os.environ.get("BAZEL_LLVM_TERMINFO_STRATEGY") - if terminfo_strategy == "system": - _llvm_terminfo_system_impl(repository_ctx) - else: - _llvm_terminfo_disable_impl(repository_ctx) - -llvm_terminfo_from_env = repository_rule( - implementation = _llvm_terminfo_from_env_impl, - configure = True, - local = True, - attrs = _merge_attrs([_terminfo_disable_attrs, _terminfo_system_attrs]), - environ = ["BAZEL_LLVM_TERMINFO_STRATEGY", "CC"], -)