diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -706,6 +706,14 @@ unsigned AddrSpace = 0, Instruction *I = nullptr) const; + /// Checks if the specified operation with the given vector type is not going + /// to be scalarized. + bool isLegalVectorOp(unsigned, VectorType *) const; + + /// Checks if the specified operation (intrinsic) with the given vector type + /// is not going to be scalarized. + bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const; + /// Return true if LSR cost of C1 is lower than C2. bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const; @@ -1757,6 +1765,10 @@ int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I) = 0; + virtual bool isLegalVectorOp(unsigned, VectorType *) const = 0; + + virtual bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const = 0; + virtual bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) = 0; virtual bool isNumRegsMajorCostOfLSR() = 0; @@ -2198,6 +2210,15 @@ return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale, AddrSpace, I); } + bool isLegalVectorOp(unsigned Opcode, VectorType *VecTy) const override { + return Impl.isLegalVectorOp(Opcode, VecTy); + } + + bool isLegalVectorIntrinsic(Intrinsic::ID Id, + VectorType *VecTy) const override { + return Impl.isLegalVectorIntrinsic(Id, VecTy); + } + bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) override { return Impl.isLSRCostLess(C1, C2); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -298,6 +298,12 @@ bool isLegalMaskedExpandLoad(Type *DataType) const { return false; } + bool isLegalVectorOp(unsigned, VectorType *) const { return true; } + + bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const { + return true; + } + bool enableOrderedReductions() const { return false; } bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -342,6 +342,108 @@ return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); } + bool isLegalVectorOp(unsigned Opcode, VectorType *VecTy) const { + int ISD = getTLI()->InstructionOpcodeToISD(Opcode); + EVT VT = getTLI()->getValueType(DL, VecTy); + TargetLoweringBase::LegalizeKind LK = + getTLI()->getTypeConversion(VecTy->getContext(), VT); + return LK.first != TargetLoweringBase::TypeScalarizeVector && + getTLI()->getOperationAction(ISD, LK.second) != + TargetLowering::Expand; + } + + static unsigned intrinsicIdToISD(Intrinsic::ID IID) { + switch (IID) { + default: + break; + case Intrinsic::sqrt: + return ISD::FSQRT; + case Intrinsic::sin: + return ISD::FSIN; + case Intrinsic::cos: + return ISD::FCOS; + case Intrinsic::exp: + return ISD::FEXP; + case Intrinsic::exp2: + return ISD::FEXP2; + case Intrinsic::log: + return ISD::FLOG; + case Intrinsic::log10: + return ISD::FLOG10; + case Intrinsic::log2: + return ISD::FLOG2; + case Intrinsic::fabs: + return ISD::FABS; + 
case Intrinsic::canonicalize: + return ISD::FCANONICALIZE; + case Intrinsic::minnum: + return ISD::FMINNUM; + case Intrinsic::maxnum: + return ISD::FMAXNUM; + case Intrinsic::minimum: + return ISD::FMINIMUM; + case Intrinsic::maximum: + return ISD::FMAXIMUM; + case Intrinsic::copysign: + return ISD::FCOPYSIGN; + case Intrinsic::floor: + return ISD::FFLOOR; + case Intrinsic::ceil: + return ISD::FCEIL; + case Intrinsic::trunc: + return ISD::FTRUNC; + case Intrinsic::nearbyint: + return ISD::FNEARBYINT; + case Intrinsic::rint: + return ISD::FRINT; + case Intrinsic::round: + return ISD::FROUND; + case Intrinsic::roundeven: + return ISD::FROUNDEVEN; + case Intrinsic::pow: + return ISD::FPOW; + case Intrinsic::fma: + return ISD::FMA; + case Intrinsic::fmuladd: + return ISD::FMA; + case Intrinsic::experimental_constrained_fmuladd: + return ISD::STRICT_FMA; + case Intrinsic::ctpop: + return ISD::CTPOP; + case Intrinsic::ctlz: + return ISD::CTLZ; + case Intrinsic::cttz: + return ISD::CTTZ; + case Intrinsic::bswap: + return ISD::BSWAP; + case Intrinsic::bitreverse: + return ISD::BITREVERSE; + } + return ISD::DELETED_NODE; + } + + bool isLegalVectorIntrinsic(Intrinsic::ID Id, VectorType *VecTy) const { + unsigned ISD = intrinsicIdToISD(Id); + switch (intrinsicIdToISD(Id)) { + default: + return true; + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: + case ISD::FSIN: + case ISD::FCOS: + case ISD::FSQRT: + break; + } + + EVT VT = getTLI()->getValueType(DL, VecTy); + return getTLI()->getTypeAction(VecTy->getContext(), VT) != + TargetLoweringBase::TypeScalarizeVector && + getTLI()->getOperationAction(ISD, VT) != TargetLowering::Expand; + } + unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const { auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) { @@ -1727,404 +1829,320 @@ // Library call cost - other than size, make it expensive. unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; - unsigned ISD = 0; - switch (IID) { - default: { - // Scalable vectors cannot be scalarized, so return Invalid. - if (isa(RetTy) || any_of(Tys, [](const Type *Ty) { - return isa(Ty); - })) - return InstructionCost::getInvalid(); - - // Assume that we need to scalarize this intrinsic. - InstructionCost ScalarizationCost = - SkipScalarizationCost ? ScalarizationCostPassed : 0; - unsigned ScalarCalls = 1; - Type *ScalarRetTy = RetTy; - if (auto *RetVTy = dyn_cast(RetTy)) { - if (!SkipScalarizationCost) - ScalarizationCost = getScalarizationOverhead( - RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); - ScalarCalls = std::max(ScalarCalls, - cast(RetVTy)->getNumElements()); - ScalarRetTy = RetTy->getScalarType(); - } - SmallVector ScalarTys; - for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { - Type *Ty = Tys[i]; - if (auto *VTy = dyn_cast(Ty)) { + // Look for intrinsics that can be lowered directly or turned into a + // scalar intrinsic call. + unsigned ISD = intrinsicIdToISD(IID); + if (ISD == ISD::DELETED_NODE) { + switch (IID) { + default: { + // Scalable vectors cannot be scalarized, so return Invalid. + if (isa(RetTy) || any_of(Tys, [](const Type *Ty) { + return isa(Ty); + })) + return InstructionCost::getInvalid(); + + // Assume that we need to scalarize this intrinsic. + InstructionCost ScalarizationCost = + SkipScalarizationCost ? 
ScalarizationCostPassed : 0; + unsigned ScalarCalls = 1; + Type *ScalarRetTy = RetTy; + if (auto *RetVTy = dyn_cast(RetTy)) { if (!SkipScalarizationCost) - ScalarizationCost += getScalarizationOverhead( - VTy, /*Insert*/ false, /*Extract*/ true, CostKind); - ScalarCalls = std::max(ScalarCalls, - cast(VTy)->getNumElements()); - Ty = Ty->getScalarType(); + ScalarizationCost = getScalarizationOverhead( + RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); + ScalarCalls = std::max( + ScalarCalls, cast(RetVTy)->getNumElements()); + ScalarRetTy = RetTy->getScalarType(); } - ScalarTys.push_back(Ty); - } - if (ScalarCalls == 1) - return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. - - IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); - InstructionCost ScalarCost = - thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); - - return ScalarCalls * ScalarCost + ScalarizationCost; - } - // Look for intrinsics that can be lowered directly or turned into a scalar - // intrinsic call. - case Intrinsic::sqrt: - ISD = ISD::FSQRT; - break; - case Intrinsic::sin: - ISD = ISD::FSIN; - break; - case Intrinsic::cos: - ISD = ISD::FCOS; - break; - case Intrinsic::exp: - ISD = ISD::FEXP; - break; - case Intrinsic::exp2: - ISD = ISD::FEXP2; - break; - case Intrinsic::log: - ISD = ISD::FLOG; - break; - case Intrinsic::log10: - ISD = ISD::FLOG10; - break; - case Intrinsic::log2: - ISD = ISD::FLOG2; - break; - case Intrinsic::fabs: - ISD = ISD::FABS; - break; - case Intrinsic::canonicalize: - ISD = ISD::FCANONICALIZE; - break; - case Intrinsic::minnum: - ISD = ISD::FMINNUM; - break; - case Intrinsic::maxnum: - ISD = ISD::FMAXNUM; - break; - case Intrinsic::minimum: - ISD = ISD::FMINIMUM; - break; - case Intrinsic::maximum: - ISD = ISD::FMAXIMUM; - break; - case Intrinsic::copysign: - ISD = ISD::FCOPYSIGN; - break; - case Intrinsic::floor: - ISD = ISD::FFLOOR; - break; - case Intrinsic::ceil: - ISD = ISD::FCEIL; - break; - case Intrinsic::trunc: - ISD = ISD::FTRUNC; - break; - case Intrinsic::nearbyint: - ISD = ISD::FNEARBYINT; - break; - case Intrinsic::rint: - ISD = ISD::FRINT; - break; - case Intrinsic::round: - ISD = ISD::FROUND; - break; - case Intrinsic::roundeven: - ISD = ISD::FROUNDEVEN; - break; - case Intrinsic::pow: - ISD = ISD::FPOW; - break; - case Intrinsic::fma: - ISD = ISD::FMA; - break; - case Intrinsic::fmuladd: - ISD = ISD::FMA; - break; - case Intrinsic::experimental_constrained_fmuladd: - ISD = ISD::STRICT_FMA; - break; - // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. 
- case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - case Intrinsic::sideeffect: - case Intrinsic::pseudoprobe: - case Intrinsic::arithmetic_fence: - return 0; - case Intrinsic::masked_store: { - Type *Ty = Tys[0]; - Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, - CostKind); - } - case Intrinsic::masked_load: { - Type *Ty = RetTy; - Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, - CostKind); - } - case Intrinsic::vector_reduce_add: - return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, - std::nullopt, CostKind); - case Intrinsic::vector_reduce_mul: - return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, - std::nullopt, CostKind); - case Intrinsic::vector_reduce_and: - return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, - std::nullopt, CostKind); - case Intrinsic::vector_reduce_or: - return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, - std::nullopt, CostKind); - case Intrinsic::vector_reduce_xor: - return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, - std::nullopt, CostKind); - case Intrinsic::vector_reduce_fadd: - return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, - FMF, CostKind); - case Intrinsic::vector_reduce_fmul: - return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, - FMF, CostKind); - case Intrinsic::vector_reduce_smax: - return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_smin: - return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_umax: - return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_umin: - return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_fmax: - return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_fmin: - return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_fmaximum: - return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_fminimum: - return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::abs: { - // abs(X) = select(icmp(X,0),X,sub(0,X)) - Type *CondTy = RetTy->getWithNewBitWidth(1); - CmpInst::Predicate Pred = CmpInst::ICMP_SGT; - InstructionCost Cost = 0; - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, - Pred, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, - Pred, CostKind); - // TODO: Should we add an OperandValueProperties::OP_Zero property? - Cost += thisT()->getArithmeticInstrCost( - BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None}); - return Cost; - } - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::umax: - case Intrinsic::umin: { - // minmax(X,Y) = select(icmp(X,Y),X,Y) - Type *CondTy = RetTy->getWithNewBitWidth(1); - bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; - CmpInst::Predicate Pred = - IsUnsigned ? 
CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; - InstructionCost Cost = 0; - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, - Pred, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, - Pred, CostKind); - return Cost; - } - case Intrinsic::sadd_sat: - case Intrinsic::ssub_sat: { - Type *CondTy = RetTy->getWithNewBitWidth(1); - - Type *OpTy = StructType::create({RetTy, CondTy}); - Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat - ? Intrinsic::sadd_with_overflow - : Intrinsic::ssub_with_overflow; - CmpInst::Predicate Pred = CmpInst::ICMP_SGT; - - // SatMax -> Overflow && SumDiff < 0 - // SatMin -> Overflow && SumDiff >= 0 - InstructionCost Cost = 0; - IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, - nullptr, ScalarizationCostPassed); - Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, - Pred, CostKind); - Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, - CondTy, Pred, CostKind); - return Cost; - } - case Intrinsic::uadd_sat: - case Intrinsic::usub_sat: { - Type *CondTy = RetTy->getWithNewBitWidth(1); + SmallVector ScalarTys; + for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { + Type *Ty = Tys[i]; + if (auto *VTy = dyn_cast(Ty)) { + if (!SkipScalarizationCost) + ScalarizationCost += getScalarizationOverhead( + VTy, /*Insert*/ false, /*Extract*/ true, CostKind); + ScalarCalls = std::max( + ScalarCalls, cast(VTy)->getNumElements()); + Ty = Ty->getScalarType(); + } + ScalarTys.push_back(Ty); + } + if (ScalarCalls == 1) + return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. - Type *OpTy = StructType::create({RetTy, CondTy}); - Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat - ? Intrinsic::uadd_with_overflow - : Intrinsic::usub_with_overflow; + IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); + InstructionCost ScalarCost = + thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); - InstructionCost Cost = 0; - IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, - nullptr, ScalarizationCostPassed); - Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); - Cost += - thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind); - return Cost; - } - case Intrinsic::smul_fix: - case Intrinsic::umul_fix: { - unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; - Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); + return ScalarCalls * ScalarCost + ScalarizationCost; + } + // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. 
+ case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::sideeffect: + case Intrinsic::pseudoprobe: + case Intrinsic::arithmetic_fence: + return 0; + case Intrinsic::masked_store: { + Type *Ty = Tys[0]; + Align TyAlign = thisT()->DL.getABITypeAlign(Ty); + return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, + 0, CostKind); + } + case Intrinsic::masked_load: { + Type *Ty = RetTy; + Align TyAlign = thisT()->DL.getABITypeAlign(Ty); + return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, + CostKind); + } + case Intrinsic::vector_reduce_add: + return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, + std::nullopt, CostKind); + case Intrinsic::vector_reduce_mul: + return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, + std::nullopt, CostKind); + case Intrinsic::vector_reduce_and: + return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, + std::nullopt, CostKind); + case Intrinsic::vector_reduce_or: + return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, + std::nullopt, CostKind); + case Intrinsic::vector_reduce_xor: + return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, + std::nullopt, CostKind); + case Intrinsic::vector_reduce_fadd: + return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, + FMF, CostKind); + case Intrinsic::vector_reduce_fmul: + return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, + FMF, CostKind); + case Intrinsic::vector_reduce_smax: + return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_smin: + return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_umax: + return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_umin: + return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_fmax: + return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_fmin: + return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_fmaximum: + return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_fminimum: + return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::abs: { + // abs(X) = select(icmp(X,0),X,sub(0,X)) + Type *CondTy = RetTy->getWithNewBitWidth(1); + CmpInst::Predicate Pred = CmpInst::ICMP_SGT; + InstructionCost Cost = 0; + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + Pred, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, + CondTy, Pred, CostKind); + // TODO: Should we add an OperandValueProperties::OP_Zero property? + Cost += thisT()->getArithmeticInstrCost( + BinaryOperator::Sub, RetTy, CostKind, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + return Cost; + } + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: { + // minmax(X,Y) = select(icmp(X,Y),X,Y) + Type *CondTy = RetTy->getWithNewBitWidth(1); + bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; + CmpInst::Predicate Pred = + IsUnsigned ? 
CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; + InstructionCost Cost = 0; + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + Pred, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, + CondTy, Pred, CostKind); + return Cost; + } + case Intrinsic::sadd_sat: + case Intrinsic::ssub_sat: { + Type *CondTy = RetTy->getWithNewBitWidth(1); - unsigned ExtOp = - IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; - TTI::CastContextHint CCH = TTI::CastContextHint::None; + Type *OpTy = StructType::create({RetTy, CondTy}); + Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat + ? Intrinsic::sadd_with_overflow + : Intrinsic::ssub_with_overflow; + CmpInst::Predicate Pred = CmpInst::ICMP_SGT; + + // SatMax -> Overflow && SumDiff < 0 + // SatMin -> Overflow && SumDiff >= 0 + InstructionCost Cost = 0; + IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, + nullptr, ScalarizationCostPassed); + Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + Pred, CostKind); + Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, + CondTy, Pred, CostKind); + return Cost; + } + case Intrinsic::uadd_sat: + case Intrinsic::usub_sat: { + Type *CondTy = RetTy->getWithNewBitWidth(1); - InstructionCost Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); - Cost += - thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); - Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, - CCH, CostKind); - Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); - return Cost; - } - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: { - Type *SumTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned Opcode = IID == Intrinsic::sadd_with_overflow - ? BinaryOperator::Add - : BinaryOperator::Sub; - - // Add: - // Overflow -> (Result < LHS) ^ (RHS < 0) - // Sub: - // Overflow -> (Result < LHS) ^ (RHS > 0) - InstructionCost Cost = 0; - Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); - Cost += 2 * thisT()->getCmpSelInstrCost( - Instruction::ICmp, SumTy, OverflowTy, - CmpInst::ICMP_SGT, CostKind); - Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, - CostKind); - return Cost; - } - case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: { - Type *SumTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned Opcode = IID == Intrinsic::uadd_with_overflow - ? BinaryOperator::Add - : BinaryOperator::Sub; - CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow - ? CmpInst::ICMP_ULT - : CmpInst::ICMP_UGT; + Type *OpTy = StructType::create({RetTy, CondTy}); + Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat + ? 
Intrinsic::uadd_with_overflow + : Intrinsic::usub_with_overflow; - InstructionCost Cost = 0; - Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); - Cost += - thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, - Pred, CostKind); - return Cost; - } - case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: { - Type *MulTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; - Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); - bool IsSigned = IID == Intrinsic::smul_with_overflow; + InstructionCost Cost = 0; + IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, + nullptr, ScalarizationCostPassed); + Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + return Cost; + } + case Intrinsic::smul_fix: + case Intrinsic::umul_fix: { + unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; + Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); - unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; - TTI::CastContextHint CCH = TTI::CastContextHint::None; + unsigned ExtOp = + IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; - InstructionCost Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); - Cost += - thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); - Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, - CCH, CostKind); - Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - - if (IsSigned) - Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - - Cost += thisT()->getCmpSelInstrCost( - BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); - return Cost; - } - case Intrinsic::fptosi_sat: - case Intrinsic::fptoui_sat: { - if (Tys.empty()) - break; - Type *FromTy = Tys[0]; - bool IsSigned = IID == Intrinsic::fptosi_sat; + InstructionCost Cost = 0; + Cost += + 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); + Cost += + thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, + CCH, CostKind); + Cost += thisT()->getArithmeticInstrCost( + Instruction::LShr, RetTy, CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + Cost += thisT()->getArithmeticInstrCost( + Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + Cost += + thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); + return Cost; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: { + Type *SumTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned Opcode = IID == Intrinsic::sadd_with_overflow + ? 
BinaryOperator::Add + : BinaryOperator::Sub; + + // Add: + // Overflow -> (Result < LHS) ^ (RHS < 0) + // Sub: + // Overflow -> (Result < LHS) ^ (RHS > 0) + InstructionCost Cost = 0; + Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); + Cost += 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, + OverflowTy, CmpInst::ICMP_SGT, + CostKind); + Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, + CostKind); + return Cost; + } + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: { + Type *SumTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned Opcode = IID == Intrinsic::uadd_with_overflow + ? BinaryOperator::Add + : BinaryOperator::Sub; + CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow + ? CmpInst::ICMP_ULT + : CmpInst::ICMP_UGT; + + InstructionCost Cost = 0; + Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, + OverflowTy, Pred, CostKind); + return Cost; + } + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + Type *MulTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; + Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); + bool IsSigned = IID == Intrinsic::smul_with_overflow; + + unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; + + InstructionCost Cost = 0; + Cost += + 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); + Cost += + thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, + CCH, CostKind); + Cost += thisT()->getArithmeticInstrCost( + Instruction::LShr, ExtTy, CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + + if (IsSigned) + Cost += thisT()->getArithmeticInstrCost( + Instruction::AShr, MulTy, CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); - InstructionCost Cost = 0; - IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, - {FromTy, FromTy}); - Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); - IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, - {FromTy, FromTy}); - Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); - Cost += thisT()->getCastInstrCost( - IsSigned ? 
Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, - TTI::CastContextHint::None, CostKind); - if (IsSigned) { - Type *CondTy = RetTy->getWithNewBitWidth(1); - Cost += thisT()->getCmpSelInstrCost( - BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); - Cost += thisT()->getCmpSelInstrCost( - BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy, + CmpInst::ICMP_NE, CostKind); + return Cost; } - return Cost; - } - case Intrinsic::ctpop: - ISD = ISD::CTPOP; + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: { + if (Tys.empty()) + break; + Type *FromTy = Tys[0]; + bool IsSigned = IID == Intrinsic::fptosi_sat; + + InstructionCost Cost = 0; + IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, + {FromTy, FromTy}); + Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); + IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, + {FromTy, FromTy}); + Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); + Cost += thisT()->getCastInstrCost( + IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, + TTI::CastContextHint::None, CostKind); + if (IsSigned) { + Type *CondTy = RetTy->getWithNewBitWidth(1); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::FCmp, FromTy, CondTy, + CmpInst::FCMP_UNO, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + CmpInst::FCMP_UNO, CostKind); + } + return Cost; + } + } + } else if (ISD == ISD::CTPOP) { // In case of legalization use TCC_Expensive. This is cheaper than a // library call but still not a cheap instruction. SingleCallCost = TargetTransformInfo::TCC_Expensive; - break; - case Intrinsic::ctlz: - ISD = ISD::CTLZ; - break; - case Intrinsic::cttz: - ISD = ISD::CTTZ; - break; - case Intrinsic::bswap: - ISD = ISD::BSWAP; - break; - case Intrinsic::bitreverse: - ISD = ISD::BITREVERSE; - break; } const TargetLoweringBase *TLI = getTLI(); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -393,6 +393,16 @@ Scale, AddrSpace, I); } +bool TargetTransformInfo::isLegalVectorOp(unsigned Opcode, + VectorType *VecTy) const { + return TTIImpl->isLegalVectorOp(Opcode, VecTy); +} + +bool TargetTransformInfo::isLegalVectorIntrinsic(Intrinsic::ID Id, + VectorType *VecTy) const { + return TTIImpl->isLegalVectorIntrinsic(Id, VecTy); +} + bool TargetTransformInfo::isLSRCostLess(const LSRCost &C1, const LSRCost &C2) const { return TTIImpl->isLSRCostLess(C1, C2); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1189,6 +1189,13 @@ UserIgnoreList = nullptr; PostponedGathers.clear(); ValueToGatherNodes.clear(); + OperandsToVectorize.clear(); + } + + /// Returns the list of operands to try to vectorize later, if the user node + /// was not vectorized. + ArrayRef> operandsToVectorize() const { + return OperandsToVectorize; } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -2427,6 +2434,10 @@ bool areAllUsersVectorized(Instruction *I, ArrayRef VectorizedVals) const; + /// Checks if the list of values is worth vectorizing and is not going to be + /// scalarized later. 
+ bool isLegalVectorOp(ArrayRef VL); + /// Return information about the vector formed for the specified index /// of a vector of (the same) instruction. TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef VL, @@ -2960,6 +2971,10 @@ /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; + /// A list of the operands of nodes that were not vectorized. These operands + /// are candidates for later vectorization. + SmallVector> OperandsToVectorize; + /// A map between the vectorized entries and the last instructions in the /// bundles. The bundles are built in use order, not in the def order of the /// instructions. So, we cannot rely directly on the last instruction in the @@ -5781,6 +5796,21 @@ return; } + // Check if the generated vector instruction won't be scalarized later. + if (!isLegalVectorOp(VL)) { + LLVM_DEBUG(dbgs() << "SLP: scalarized bundle starting " << *S.OpValue + << ".\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + // Gather operands to try to vectorize them later. + for (unsigned I = 0, End = S.MainOp->getNumOperands(); I < End; ++I) { + auto &Operands = OperandsToVectorize.emplace_back(); + for (Value *V : VL) + Operands.push_back(cast(V)->getOperand(I)); + } + return; + } + auto &BSRef = BlocksSchedules[BB]; if (!BSRef) BSRef = std::make_unique(BB); @@ -6431,6 +6461,75 @@ return I->getOpcode() == AltOp->getOpcode(); } +bool BoUpSLP::isLegalVectorOp(ArrayRef VL) { + InstructionsState S = getSameOpcode(VL, *TLI); + const unsigned Sz = VL.size(); + Value *V0 = VL.front(); + Type *ScalarTy = V0->getType(); + if (isa(V0)) + return true; + if (auto *CI = dyn_cast(V0)) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *CI = dyn_cast(V0)) + if (!isa(CI)) + ScalarTy = CI->getSrcTy(); + if (!isValidElementType(ScalarTy)) + return false; + auto *VecTy = FixedVectorType::get(ScalarTy, Sz); + + // If we have computed a smaller type for the expression, update VecTy so + // that the costs will be accurate. + const auto It = MinBWs.find(VL[0]); + if (It != MinBWs.end()) + VecTy = FixedVectorType::get( + IntegerType::get(F->getContext(), It->second.first), VL.size()); + + unsigned ShuffleOrOp = + S.isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : S.getOpcode(); + switch (ShuffleOrOp) { + case Instruction::URem: + case Instruction::SRem: + case Instruction::UDiv: + case Instruction::SDiv: { + // Check if it can be represented as shift + TTI::OperandValueInfo OVI = getOperandInfo(VL, 1); + if (OVI.isConstant()) + return true; + return TTI->isLegalVectorOp(ShuffleOrOp, VecTy); + } + case Instruction::Mul: { + // Check if it can be represented as shift + TTI::OperandValueInfo OVI = getOperandInfo(VL, 1); + if (OVI.isConstant()) + return true; + return TTI->isLegalVectorOp(ShuffleOrOp, VecTy); + } + case Instruction::FNeg: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return TTI->isLegalVectorOp(ShuffleOrOp, VecTy); + case Instruction::Call: { + auto *CI = cast(V0); + auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); + return (VecCallCosts.first > VecCallCosts.second || + TTI->isLegalVectorIntrinsic(CI->getIntrinsicID(), VecTy)); + } + default: + return true; + } +} + TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef VL, unsigned OpIdx) { assert(!VL.empty()); @@ -12382,6 +12481,51 @@ return Changed; } +static bool vectorizeOperands(BoUpSLP &R) { + SmallVector> Operands(R.operandsToVectorize().begin(), + R.operandsToVectorize().end()); + DenseSet VisitedOperands; + bool Changed = false; + while (!Operands.empty()) { + SmallVector Chain = Operands.pop_back_val(); + if (!VisitedOperands.insert(hash_value(ArrayRef(Chain))).second) + continue; + unsigned VF = Chain.size(); + R.buildTree(Chain); + if (R.isTreeTinyAndNotFullyVectorizable()) + return false; + if (R.isLoadCombineCandidate()) + return false; + R.reorderTopToBottom(); + R.reorderBottomToTop(); + R.buildExternalUses(); + + R.computeMinimumValueSizes(); + + InstructionCost Cost = R.getTreeCost(); + + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF + << "\n"); + if (Cost < -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); + + using namespace ore; + + R.getORE()->emit(OptimizationRemark(SV_NAME, "OperandsVectorized", + cast(Chain[0])) + << "Operands SLP vectorized with cost " + << NV("Cost", Cost) << " and with tree size " + << NV("TreeSize", R.getTreeSize())); + + R.vectorizeTree(); + Changed = true; + } + Operands.append(R.operandsToVectorize().begin(), + R.operandsToVectorize().end()); + } + return Changed; +} + bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned Idx, unsigned MinVF) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -12421,10 +12565,11 @@ << NV("TreeSize", R.getTreeSize())); R.vectorizeTree(); + (void)vectorizeOperands(R); return true; } - return false; + return vectorizeOperands(R); } bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, @@ -12786,6 +12931,7 @@ NextInst = I + 1; Changed = true; } + Changed |= vectorizeOperands(R); } } @@ -13802,6 +13948,7 @@ // Vectorize a tree. 
Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues, ReplacedExternals, InsertPt); + (void)vectorizeOperands(V); Builder.SetInsertPoint(InsertPt); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -24,11 +24,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -997,11 +999,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll --- 
a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -24,11 +24,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -997,11 +999,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -6,12 +6,26 @@ ; Simple 3-pair chain with loads and stores define amdgpu_kernel void @test1_as_3_3_3_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c) { -; GCN-LABEL: @test1_as_3_3_3_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; 
GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]] -; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @test1_as_3_3_3_v2f16( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 +; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]] +; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @test1_as_3_3_3_v2f16( +; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2 +; VI-NEXT: [[I1:%.*]] = load half, ptr addrspace(3) [[B:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2 +; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1 +; VI-NEXT: [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]] +; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 %i1 = load half, ptr addrspace(3) %b, align 2 @@ -28,12 +42,26 @@ } define amdgpu_kernel void @test1_as_3_0_0(ptr addrspace(3) %a, ptr %b, ptr %c) { -; GCN-LABEL: @test1_as_3_0_0( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]] -; GCN-NEXT: store <2 x half> [[TMP5]], ptr [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @test1_as_3_0_0( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 +; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]] +; GFX9-NEXT: store <2 x half> [[TMP3]], ptr [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @test1_as_3_0_0( +; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2 +; VI-NEXT: [[I1:%.*]] = load half, ptr [[B:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2 +; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr [[B]], i64 1 +; VI-NEXT: [[I4:%.*]] = load half, ptr [[ARRAYIDX4]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]] +; VI-NEXT: store half [[MUL]], ptr [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 %i1 = load half, ptr %b, align 2 @@ -50,12 +78,26 @@ } define amdgpu_kernel void @test1_as_0_0_3_v2f16(ptr %a, ptr %b, ptr addrspace(3) %c) { -; GCN-LABEL: @test1_as_0_0_3_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]] -; 
GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @test1_as_0_0_3_v2f16( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 +; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]] +; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @test1_as_0_0_3_v2f16( +; VI-NEXT: [[I0:%.*]] = load half, ptr [[A:%.*]], align 2 +; VI-NEXT: [[I1:%.*]] = load half, ptr [[B:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr [[ARRAYIDX3]], align 2 +; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr [[B]], i64 1 +; VI-NEXT: [[I4:%.*]] = load half, ptr [[ARRAYIDX4]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]] +; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr %a, align 2 %i1 = load half, ptr %b, align 2 @@ -73,11 +115,11 @@ define amdgpu_kernel void @test1_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) { ; GCN-LABEL: @test1_fma_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 -; GCN-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP4]], <2 x half> [[TMP6]]) -; GCN-NEXT: store <2 x half> [[TMP7]], ptr addrspace(3) [[D:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 +; GCN-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP2]], <2 x half> [[TMP3]]) +; GCN-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[D:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -98,13 +140,24 @@ } define amdgpu_kernel void @mul_scalar_v2f16(ptr addrspace(3) %a, half %scalar, ptr addrspace(3) %c) { -; GCN-LABEL: @mul_scalar_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP3:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0 -; GCN-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x half> [[TMP3]], <2 x half> poison, <2 x i32> zeroinitializer -; GCN-NEXT: [[TMP4:%.*]] = fmul <2 x half> [[TMP2]], [[SHUFFLE]] -; GCN-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @mul_scalar_v2f16( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0 +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <2 x i32> zeroinitializer +; GFX9-NEXT: [[TMP4:%.*]] = fmul <2 x half> [[TMP1]], [[TMP3]] +; GFX9-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @mul_scalar_v2f16( +; 
VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[SCALAR:%.*]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[SCALAR]] +; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 %mul = fmul half %i0, %scalar @@ -119,9 +172,9 @@ define amdgpu_kernel void @fabs_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) { ; GCN-LABEL: @fabs_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]]) -; GCN-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]]) +; GCN-NEXT: store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -137,12 +190,12 @@ define amdgpu_kernel void @test1_fabs_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) { ; GCN-LABEL: @test1_fabs_fma_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 -; GCN-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]]) -; GCN-NEXT: [[TMP8:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP7]], <2 x half> [[TMP4]], <2 x half> [[TMP6]]) -; GCN-NEXT: store <2 x half> [[TMP8]], ptr addrspace(3) [[D:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 +; GCN-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]]) +; GCN-NEXT: [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP4]], <2 x half> [[TMP2]], <2 x half> [[TMP3]]) +; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -172,12 +225,12 @@ ; GCN-NEXT: [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]]) ; GCN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1 ; GCN-NEXT: [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2 -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 -; GCN-NEXT: [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[I4]], i32 1 -; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP6]], <2 x half> [[TMP4]]) -; GCN-NEXT: store <2 x half> [[TMP7]], ptr addrspace(3) [[D:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: 
[[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP3:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 +; GCN-NEXT: [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[I4]], i32 1 +; GCN-NEXT: [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP4]], <2 x half> [[TMP2]]) +; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -201,9 +254,9 @@ define amdgpu_kernel void @canonicalize_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) { ; GFX9-LABEL: @canonicalize_v2f16( -; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GFX9-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP2]]) -; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP1]]) +; GFX9-NEXT: store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2 ; GFX9-NEXT: ret void ; ; VI-LABEL: @canonicalize_v2f16( diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll --- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll @@ -208,11 +208,13 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[VECINS_31]] +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] ; ; DEFAULT-LABEL: define <4 x float> @int_exp_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -224,11 +226,13 @@ ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) -; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] 
+; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -314,11 +318,13 @@
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32>
-; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
+; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
+; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
; DEFAULT-LABEL: define <4 x float> @int_log_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -330,11 +336,13 @@
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32>
-; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32>
-; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32>
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -420,11 +428,13 @@
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32>
-; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
+; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
+; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
; DEFAULT-LABEL: define <4 x float> @int_sin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -436,11 +446,13 @@
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32>
-; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32>
-; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32>
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll
@@ -3,7 +3,21 @@
define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
; CHECK-LABEL: @sdiv_v8i32_undefs(
-; CHECK-NEXT: ret <8 x i32> poison
+; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
+; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]],
+; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]],
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32>
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32>
+; CHECK-NEXT: ret <8 x i32> [[R71]]
;
%a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll
@@ -8,10 +8,10 @@
; Base case with no interesting control dependencies
define void @test_no_control(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: @test_no_control(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[B:%.*]], align 4
; CHECK-NEXT: ret void
;
%v1 = load i64, ptr %a
@@ -34,11 +34,11 @@
; CHECK-LABEL: @test1(
; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
; CHECK-NEXT: ret void
;
%v1 = load i64, ptr %a
@@ -60,11 +60,11 @@
; CHECK-LABEL: @test2(
; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
; CHECK-NEXT: ret void
;
%c1 = load i64, ptr %c
@@ -87,11 +87,11 @@
; CHECK-LABEL: @test3(
; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
; CHECK-NEXT: ret void
;
%v1 = load i64, ptr %a
@@ -113,11 +113,11 @@
; CHECK-LABEL: @test4(
; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
; CHECK-NEXT: ret void
;
%v1 = load i64, ptr %a
@@ -139,11 +139,11 @@
; CHECK-LABEL: @test5(
; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()
; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
; CHECK-NEXT: ret void
;
%a2 = getelementptr i64, ptr %a, i32 1
@@ -164,10 +164,10 @@
define void @test6(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: @test6(
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]]
-; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
; CHECK-NEXT: ret void
;
%v1 = load i64, ptr %a
@@ -200,11 +200,11 @@
; CHECK-NEXT: store i64 0, ptr [[A]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]
-; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
; CHECK-NEXT: ret void
;
%v1 = load i64, ptr %a
@@ -233,11 +233,11 @@
; CHECK-NEXT: store i64 0, ptr [[A]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() #[[ATTR4:[0-9]+]]
; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]
-; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
; CHECK-NEXT: ret void
;
%v1 = load i64, ptr %a
@@ -266,11 +266,11 @@
; CHECK-NEXT: store i64 0, ptr [[A]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw()
; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]
-; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
; CHECK-NEXT: ret void
;
%v1 = load i64, ptr %a
@@ -294,18 +294,18 @@
; A variant of test7 which shows the same problem with a non-load instruction
define void @test10(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: @test10(
-; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A]], i32 1
-; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4
-; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[V1]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[TMP2]]
; CHECK-NEXT: store i64 [[U1]], ptr [[A]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[V2]]
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]
-; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @may_inf_loop_ro()
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[U2]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP7]], [[TMP5]]
+; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr [[B:%.*]], align 4
; CHECK-NEXT: ret void
;
%v1 = load i64, ptr %a
@@ -337,11 +337,11 @@
; CHECK-NEXT: store i64 [[U1]], ptr [[B:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[Y:%.*]]
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]
-; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B]], align 4
; CHECK-NEXT: ret void
;
%u1 = udiv i64 200, %x
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
@@ -10,19 +10,26 @@
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
+; CHECK-NEXT: [[D0:%.*]] = sdiv i64 [[P0]], [[P0]]
+; CHECK-NEXT: [[D1:%.*]] = sdiv i64 [[P1]], [[P1]]
+; CHECK-NEXT: [[D2:%.*]] = sdiv i64 [[P2]], [[P2]]
+; CHECK-NEXT: [[D3:%.*]] = sdiv i64 [[P3]], [[P3]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[D0]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[D1]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[D2]], i32 2
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[D3]], i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i64> [[TMP5]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = shl <4 x i64> [[TMP4]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[D0]], i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[D1]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i64> [[TMP13]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32>
; CHECK-NEXT: br label [[BB:%.*]]
; CHECK: bb:
-; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP19:%.*]], [[BB]] ], [ [[TMP17]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP19]] = trunc <4 x i64> [[TMP11]] to <4 x i32>
; CHECK-NEXT: br label [[BB]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
@@ -18,22 +18,24 @@
; CHECK-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
; CHECK-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP5]])
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]])
-; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
-; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8
+; CHECK-NEXT: [[SIN0:%.*]] = call fast double @llvm.sin.f64(double [[A2]])
+; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]])
+; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]])
+; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]])
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A4]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[SIN1]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[SIN3]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[SIN0]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[SIN2]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP11]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP12]]
+; CHECK-NEXT: store <2 x double> [[TMP13]], ptr @dst, align 8
; CHECK-NEXT: ret void
;
; VECLIB-LABEL: @test(