diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -430,26 +430,6 @@
 ///                                                         i32 6>
 /// %2 = mul <4 x i8> %1, %1
 /// ret <4 x i8> %2
-/// We convert this initially to something like:
-/// %x0 = extractelement <4 x i8> %x, i32 0
-/// %x3 = extractelement <4 x i8> %x, i32 3
-/// %y1 = extractelement <4 x i8> %y, i32 1
-/// %y2 = extractelement <4 x i8> %y, i32 2
-/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
-/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
-/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
-/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
-/// %5 = mul <4 x i8> %4, %4
-/// %6 = extractelement <4 x i8> %5, i32 0
-/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
-/// %7 = extractelement <4 x i8> %5, i32 1
-/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
-/// %8 = extractelement <4 x i8> %5, i32 2
-/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
-/// %9 = extractelement <4 x i8> %5, i32 3
-/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
-/// ret <4 x i8> %ins4
-/// InstCombiner transforms this into a shuffle and vector mul
 /// Mask will return the Shuffle Mask equivalent to the extracted elements.
 /// TODO: Can we split off and reuse the shuffle mask detection from
 /// ShuffleVectorInst/getShuffleCost?
@@ -1080,6 +1060,7 @@
 class BoUpSLP {
   struct TreeEntry;
   struct ScheduleData;
+  class ShuffleCostEstimator;
   class ShuffleInstructionBuilder;
 
 public:
@@ -2449,6 +2430,12 @@
   /// \p E.
   Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
 
+  /// Create a new vector from a list of scalar values.  Produces a sequence
+  /// which exploits values reused across lanes, and arranges the inserts
+  /// for ease of later optimization.
+  template <typename BVTy, typename ResTy, typename... Args>
+  ResTy processBuildVector(const TreeEntry *E, Args &...Params);
+
   /// Create a new vector from a list of scalar values.  Produces a sequence
   /// which exploits values reused across lanes, and arranges the inserts
   /// for ease of later optimization.
@@ -2490,7 +2477,7 @@
   void setInsertPointAfterBundle(const TreeEntry *E);
 
   /// \returns a vector from a collection of scalars in \p VL.
-  Value *gather(ArrayRef<Value *> VL);
+  Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr);
 
   /// \returns whether the VectorizableTree is fully vectorizable and will
   /// be beneficial even the tree height is tiny.
@@ -2548,7 +2535,7 @@
         // directly, without reordering.
         SmallVector<int> Mask;
         inversePermutation(ReorderIndices, Mask);
-        if (VL.size() == Scalars.size())
+        if (VL.size() == Scalars.size() && ReuseShuffleIndices.empty())
           return IsSame(Scalars, Mask);
         if (VL.size() == ReuseShuffleIndices.size()) {
           ::addMask(Mask, ReuseShuffleIndices);
@@ -6287,68 +6274,6 @@
   return {IntrinsicCost, LibCost};
 }
 
-/// Compute the cost of creating a vector of type \p VecTy containing the
-/// extracted values from \p VL.
-static InstructionCost
-computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
-                   TargetTransformInfo::ShuffleKind ShuffleKind,
-                   ArrayRef<int> Mask, TargetTransformInfo &TTI) {
-  unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
-
-  if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts ||
-      VecTy->getNumElements() < NumOfParts)
-    return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
-
-  bool AllConsecutive = true;
-  unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
-  unsigned Idx = -1;
-  InstructionCost Cost = 0;
-
-  // Process extracts in blocks of EltsPerVector to check if the source vector
-  // operand can be re-used directly. If not, add the cost of creating a shuffle
-  // to extract the values into a vector register.
-  SmallVector<int> RegMask(EltsPerVector, UndefMaskElem);
-  for (auto *V : VL) {
-    ++Idx;
-
-    // Reached the start of a new vector registers.
-    if (Idx % EltsPerVector == 0) {
-      RegMask.assign(EltsPerVector, UndefMaskElem);
-      AllConsecutive = true;
-      continue;
-    }
-
-    // Need to exclude undefs from analysis.
-    if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
-      continue;
-
-    // Check all extracts for a vector register on the target directly
-    // extract values in order.
-    unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
-    if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) {
-      unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
-      AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
-                        CurrentIdx % EltsPerVector == Idx % EltsPerVector;
-      RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
-    }
-
-    if (AllConsecutive)
-      continue;
-
-    // Skip all indices, except for the last index per vector block.
-    if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
-      continue;
-
-    // If we have a series of extracts which are not consecutive and hence
-    // cannot re-use the source vector register directly, compute the shuffle
-    // cost to extract the vector with EltsPerVector elements.
-    Cost += TTI.getShuffleCost(
-        TargetTransformInfo::SK_PermuteSingleSrc,
-        FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask);
-  }
-  return Cost;
-}
-
 /// Build shuffle mask for shuffle graph entries and lists of main and alternate
 /// operations operands.
 static void
@@ -6596,9 +6521,10 @@
         LocalVF = SVOpTy->getNumElements();
       SmallVector<int> ExtMask(Mask.size(), UndefMaskElem);
       for (auto [Idx, I] : enumerate(Mask)) {
-         if (I == UndefMaskElem)
-           continue;
-         ExtMask[Idx] = SV->getMaskValue(I);
+        if (I == UndefMaskElem ||
+            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
+          continue;
+        ExtMask[Idx] = SV->getMaskValue(I);
       }
       bool IsOp1Undef =
           isUndefVector(SV->getOperand(0),
@@ -6657,10 +6583,12 @@
   /// Smart shuffle instruction emission, walks through shuffles trees and
   /// tries to find the best matching vector for the actual shuffle
   /// instruction.
-  template <typename ShuffleBuilderTy>
-  static Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
-                              ShuffleBuilderTy &Builder) {
+  template <typename T, typename ShuffleBuilderTy>
+  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
+                         ShuffleBuilderTy &Builder) {
     assert(V1 && "Expected at least one vector value.");
+    if (V2)
+      Builder.resizeToMatch(V1, V2);
     int VF = Mask.size();
     if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
       VF = FTy->getNumElements();
@@ -6735,12 +6663,9 @@
           }
       } while (PrevOp1 != Op1 || PrevOp2 != Op2);
       Builder.resizeToMatch(Op1, Op2);
-      VF = std::max(cast<VectorType>(Op1->getType())
-                        ->getElementCount()
-                        .getKnownMinValue(),
-                    cast<VectorType>(Op2->getType())
-                        ->getElementCount()
-                        .getKnownMinValue());
+      VF = cast<VectorType>(Op1->getType())
+               ->getElementCount()
+               .getKnownMinValue();
       for (int I = 0, E = Mask.size(); I < E; ++I) {
         if (CombinedMask2[I] != UndefMaskElem) {
           assert(CombinedMask1[I] == UndefMaskElem &&
@@ -6748,61 +6673,120 @@
           CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
         }
       }
+      const int Limit = CombinedMask1.size() * 2;
+      if (Op1 == Op2 && Limit == 2 * VF &&
+          all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) &&
+          (ShuffleVectorInst::isIdentityMask(CombinedMask1) ||
+           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) &&
+            isa<ShuffleVectorInst>(Op1) &&
+            cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
+                ArrayRef(CombinedMask1))))
+        return Builder.createIdentity(Op1);
       return Builder.createShuffleVector(
           Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
           CombinedMask1);
     }
     if (isa<PoisonValue>(V1))
-      return PoisonValue::get(FixedVectorType::get(
-          cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
+      return Builder.createPoison(
+          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
     SmallVector<int> NewMask(Mask.begin(), Mask.end());
     bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
     assert(V1 && "Expected non-null value after looking through shuffles.");
 
     if (!IsIdentity)
       return Builder.createShuffleVector(V1, NewMask);
-    return V1;
+    return Builder.createIdentity(V1);
   }
 };
 } // namespace
 
-InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
-                                      ArrayRef<Value *> VectorizedVals) {
-  ArrayRef<Value *> VL = E->Scalars;
+/// Merges shuffle masks and estimates the cost of the final shuffle, if one is
+/// required. It supports shuffling of 2 input vectors. The estimation is lazy:
+/// the cost of a shuffle is added only when that shuffle is actually required.
+/// Otherwise, the new mask is merged into the common mask and the accounting
+/// is delayed till the end of the process, to reduce the number of shuffles
+/// that get charged.
+class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
+  bool IsFinalized = false;
+  SmallVector<int> CommonMask;
+  SmallVector<Value *, 2> InVectors;
+  const TargetTransformInfo &TTI;
+  InstructionCost Cost = 0;
+  ArrayRef<Value *> VectorizedVals;
+  BoUpSLP &R;
+  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
-  Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-  else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
-    ScalarTy = CI->getOperand(0)->getType();
-  else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
-    ScalarTy = IE->getOperand(1)->getType();
-  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  class ShuffleCostBuilder {
+    const TargetTransformInfo &TTI;
 
-  // If we have computed a smaller type for the expression, update VecTy so
-  // that the costs will be accurate.
-  if (MinBWs.count(VL[0]))
-    VecTy = FixedVectorType::get(
-        IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
-  unsigned EntryVF = E->getVectorFactor();
-  auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
+    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
+      return Mask.empty() ||
+             (VF == Mask.size() && all_of(enumerate(Mask), [](auto Pair) {
+                return Pair.value() == UndefMaskElem ||
+                       Pair.index() == static_cast<unsigned>(Pair.value());
+              }));
+    }
 
-  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
-  // FIXME: it tries to fix a problem with MSVC buildbots.
-  TargetTransformInfo *TTI = this->TTI;
-  auto AdjustExtractsCost = [=](InstructionCost &Cost,
-                                ArrayRef<int> Mask) -> Value * {
+  public:
+    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
+    ~ShuffleCostBuilder() = default;
+    InstructionCost createShuffleVector(Value *V1, Value *,
+                                        ArrayRef<int> Mask) const {
+      // Empty mask or identity mask are free.
+      unsigned VF =
+          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
+      if (isEmptyOrIdentity(Mask, VF))
+        return TTI::TCC_Free;
+      return TTI.getShuffleCost(
+          TTI::SK_PermuteTwoSrc,
+          FixedVectorType::get(
+              cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
+          Mask);
+    }
+    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
+      // Empty mask or identity mask are free.
+      if (isEmptyOrIdentity(Mask, Mask.size()))
+        return TTI::TCC_Free;
+      return TTI.getShuffleCost(
+          TTI::SK_PermuteSingleSrc,
+          FixedVectorType::get(
+              cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
+          Mask);
+    }
+    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
+    InstructionCost createPoison(Type *Ty, unsigned VF) const {
+      return TTI::TCC_Free;
+    }
+    void resizeToMatch(Value *&, Value *&) const {}
+  };
+
+  /// Computes the cost of shuffling \p V1 and \p V2 (may be null) by \p Mask.
+  /// Runs the shared BaseShuffleAnalysis::createShuffle walk with a
+  /// ShuffleCostBuilder, so the result is an InstructionCost.
+  InstructionCost createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
+    ShuffleCostBuilder CostBuilder(TTI);
+    return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask,
+                                                               CostBuilder);
+  }
+
+public:
+  ShuffleCostEstimator(TargetTransformInfo &TTI,
+                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R)
+      : TTI(TTI), VectorizedVals(VectorizedVals), R(R) {}
+  Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) {
     if (Mask.empty())
       return nullptr;
     Value *VecBase = nullptr;
+    ArrayRef<Value *> VL = E->Scalars;
+    auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
     // If the resulting type is scalarized, do not adjust the cost.
-    unsigned VecNumParts = TTI->getNumberOfParts(VecTy);
+    unsigned VecNumParts = TTI.getNumberOfParts(VecTy);
     if (VecNumParts == VecTy->getNumElements())
       return nullptr;
     DenseMap<Value *, int> ExtractVectorsTys;
     SmallPtrSet<Value *, 4> CheckedExtracts;
     for (auto [I, V] : enumerate(VL)) {
+      // Ignore non-extractelement scalars.
       if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == UndefMaskElem))
         continue;
       // If all users of instruction are going to be vectorized and this
@@ -6811,9 +6795,9 @@
       // vectorized tree.
       // Also, avoid adjusting the cost for extractelements with multiple uses
       // in different graph entries.
-      const TreeEntry *VE = getTreeEntry(V);
+      const TreeEntry *VE = R.getTreeEntry(V);
       if (!CheckedExtracts.insert(V).second ||
-          !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
+          !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
           (VE && VE != E))
         continue;
       auto *EE = cast<ExtractElementInst>(V);
@@ -6822,7 +6806,7 @@
       if (!EEIdx)
         continue;
       unsigned Idx = *EEIdx;
-      if (VecNumParts != TTI->getNumberOfParts(EE->getVectorOperandType())) {
+      if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) {
         auto It =
             ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
         It->getSecond() = std::min<int>(It->second, Idx);
@@ -6835,18 +6819,17 @@
             })) {
           // Use getExtractWithExtendCost() to calculate the cost of
           // extractelement/ext pair.
-          Cost -=
-              TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
-                                            EE->getVectorOperandType(), Idx);
+          Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+                                               EE->getVectorOperandType(), Idx);
           // Add back the cost of s|zext which is subtracted separately.
-          Cost += TTI->getCastInstrCost(
+          Cost += TTI.getCastInstrCost(
               Ext->getOpcode(), Ext->getType(), EE->getType(),
               TTI::getCastContextHint(Ext), CostKind, Ext);
           continue;
         }
       }
-      Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
-                                      Idx);
+      Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
+                                     Idx);
     }
     // Add a cost for subvector extracts/inserts if required.
     for (const auto &Data : ExtractVectorsTys) {
@@ -6854,256 +6837,364 @@
       unsigned NumElts = VecTy->getNumElements();
       if (Data.second % NumElts == 0)
         continue;
-      if (TTI->getNumberOfParts(EEVTy) > VecNumParts) {
+      if (TTI.getNumberOfParts(EEVTy) > VecNumParts) {
         unsigned Idx = (Data.second / NumElts) * NumElts;
         unsigned EENumElts = EEVTy->getNumElements();
         if (Idx % NumElts == 0)
           continue;
         if (Idx + NumElts <= EENumElts) {
-          Cost +=
-              TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                  EEVTy, std::nullopt, CostKind, Idx, VecTy);
+          Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+                                     EEVTy, std::nullopt, CostKind, Idx, VecTy);
         } else {
           // Need to round up the subvector type vectorization factor to avoid a
           // crash in cost model functions. Make SubVT so that Idx + VF of SubVT
           // <= EENumElts.
           auto *SubVT =
               FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
-          Cost +=
-              TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                  EEVTy, std::nullopt, CostKind, Idx, SubVT);
+          Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+                                     EEVTy, std::nullopt, CostKind, Idx, SubVT);
         }
       } else {
-        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
-                                    VecTy, std::nullopt, CostKind, 0, EEVTy);
+        Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
+                                   VecTy, std::nullopt, CostKind, 0, EEVTy);
       }
     }
     return VecBase;
-  };
-  if (E->State == TreeEntry::NeedToGather) {
-    if (allConstant(VL))
-      return 0;
-    if (isa<InsertElementInst>(VL[0]))
-      return InstructionCost::getInvalid();
-    unsigned VF = E->getVectorFactor();
-    SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
-                                          E->ReuseShuffleIndices.end());
-    SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
-    // Build a mask out of the reorder indices and reorder scalars per this
-    // mask.
-    SmallVector<int> ReorderMask;
-    inversePermutation(E->ReorderIndices, ReorderMask);
-    if (!ReorderMask.empty())
-      reorderScalars(GatheredScalars, ReorderMask);
-    SmallVector<int> Mask;
-    SmallVector<int> ExtractMask;
-    std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
-    std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
-    SmallVector<const TreeEntry *> Entries;
-    Type *ScalarTy = GatheredScalars.front()->getType();
-    // Check for gathered extracts.
-    ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
-    SmallVector<Value *> IgnoredVals;
-    if (UserIgnoreList)
-      IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
-
-    InstructionCost Cost = 0;
-    bool Resized = false;
-    if (Value *VecBase = AdjustExtractsCost(Cost, ExtractMask))
-      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
-        if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
-          Resized = true;
-          GatheredScalars.append(VF - GatheredScalars.size(),
-                                 PoisonValue::get(ScalarTy));
-        }
-
-    // Do not try to look for reshuffled loads for gathered loads (they will be
-    // handled later), for vectorized scalars, and cases, which are definitely
-    // not profitable (splats and small gather nodes.)
-    if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
-        E->isAltShuffle() ||
-        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
-        isSplat(E->Scalars) ||
-        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
-      GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
-    if (GatherShuffle) {
-      assert((Entries.size() == 1 || Entries.size() == 2) &&
-             "Expected shuffle of 1 or 2 entries.");
-      if (!Resized) {
-        unsigned VF1 = Entries.front()->getVectorFactor();
-        unsigned VF2 = Entries.back()->getVectorFactor();
-        if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
-          GatheredScalars.append(VF - GatheredScalars.size(),
-                                 PoisonValue::get(ScalarTy));
+  }
+  std::optional<InstructionCost>
+  needToDelay(const TreeEntry *, ArrayRef<const TreeEntry *>) const {
+    // Cost estimation never has to be delayed, so the arguments are unused.
+    return std::nullopt;
+  }
+  void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
+    // The vectorized values of the entries are not ready yet, so null
+    // constants of the matching vector types stand in for them.
+    auto NullVectorFor = [](const TreeEntry *TE) -> Value * {
+      return Constant::getNullValue(FixedVectorType::get(
+          TE->Scalars.front()->getType(), TE->getVectorFactor()));
+    };
+    add(NullVectorFor(E1), NullVectorFor(E2), Mask);
+  }
+  void add(const TreeEntry *E1, ArrayRef<int> Mask) {
+    // The vectorized value of the entry is not ready yet, so a null constant
+    // of the matching vector type stands in for it.
+    auto *VecTy = FixedVectorType::get(E1->Scalars.front()->getType(),
+                                       E1->getVectorFactor());
+    add(Constant::getNullValue(VecTy), Mask);
+  }
+  /// Adds \p V1 and \p V2 shuffled by \p Mask, costing any pending shuffle.
+  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
+    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
+    if (InVectors.empty()) { // First inputs: just record them, no cost yet.
+      InVectors.push_back(V1);
+      InVectors.push_back(V2);
+      CommonMask.assign(Mask.begin(), Mask.end());
+      return;
+    }
+    Value *Vec = InVectors.front();
+    if (InVectors.size() == 2) {
+      Cost += createShuffle(Vec, InVectors.back(), CommonMask);
+      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+        if (CommonMask[Idx] != UndefMaskElem) // Lane filled by that shuffle.
+          CommonMask[Idx] = Idx;
+    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
+               Mask.size()) {
+      Cost += createShuffle(Vec, nullptr, CommonMask);
+      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+        if (CommonMask[Idx] != UndefMaskElem) // Lane filled by that shuffle.
+          CommonMask[Idx] = Idx;
+    }
+    Cost += createShuffle(V1, V2, Mask);
+    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+      if (Mask[Idx] != UndefMaskElem)
+        CommonMask[Idx] = Idx + Sz; // Lane now comes from the new shuffle.
+    InVectors.front() = Vec;
+    if (InVectors.size() == 2)
+      InVectors.back() = V1;
+    else
+      InVectors.push_back(V1);
+  }
+  /// Adds one more input vector and the mask for its shuffling.
+  void add(Value *V1, ArrayRef<int> Mask) {
+    if (InVectors.empty()) {
+      if (!isa<FixedVectorType>(V1->getType())) {
+        Cost += createShuffle(V1, nullptr, CommonMask);
+        CommonMask.assign(Mask.size(), UndefMaskElem);
+        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+          if (Mask[Idx] != UndefMaskElem)
+            CommonMask[Idx] = Idx;
       }
-      // Remove shuffled elements from list of gathers.
-      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
-        if (Mask[I] != UndefMaskElem)
-          GatheredScalars[I] = PoisonValue::get(ScalarTy);
+      InVectors.push_back(V1);
+      CommonMask.assign(Mask.begin(), Mask.end());
+      return;
+    }
+    const auto *It = find(InVectors, V1);
+    if (It == InVectors.end()) {
+      if (InVectors.size() == 2 ||
+          InVectors.front()->getType() != V1->getType() ||
+          !isa<FixedVectorType>(V1->getType())) {
+        Value *V = InVectors.front();
+        if (InVectors.size() == 2) {
+          Cost +=
+              createShuffle(InVectors.front(), InVectors.back(), CommonMask);
+          for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+            if (CommonMask[Idx] != UndefMaskElem)
+              CommonMask[Idx] = Idx;
+        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
+                   CommonMask.size()) {
+          Cost += createShuffle(InVectors.front(), nullptr, CommonMask);
+          for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+            if (CommonMask[Idx] != UndefMaskElem)
+              CommonMask[Idx] = Idx;
+        }
+        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+          if (CommonMask[Idx] == UndefMaskElem && Mask[Idx] != UndefMaskElem)
+            CommonMask[Idx] =
+                V->getType() != V1->getType()
+                    ? Idx + Sz
+                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
+                                      ->getNumElements();
+        if (V->getType() != V1->getType())
+          Cost += createShuffle(V1, nullptr, Mask);
+        InVectors.front() = V;
+        if (InVectors.size() == 2)
+          InVectors.back() = V1;
+        else
+          InVectors.push_back(V1);
+        return;
       }
-      InstructionCost GatherCost = 0;
-      int Limit = Mask.size() * 2;
-      if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
-          ShuffleVectorInst::isIdentityMask(Mask)) {
-        // Perfect match in the graph, will reuse the previously vectorized
-        // node. Cost is 0.
-        LLVM_DEBUG(
-            dbgs()
-            << "SLP: perfect diamond match for gather bundle that starts with "
-            << *VL.front() << ".\n");
-        if (NeedToShuffleReuses)
-          GatherCost =
-              TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                  FinalVecTy, E->ReuseShuffleIndices);
-      } else {
-        LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
-                          << " entries for bundle that starts with "
-                          << *VL.front() << ".\n");
-        // Detected that instead of gather we can emit a shuffle of single/two
-        // previously vectorized nodes. Add the cost of the permutation rather
-        // than gather.
-        ::addMask(Mask, E->ReuseShuffleIndices);
-        GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
-      }
-      if (!all_of(GatheredScalars, UndefValue::classof))
-        GatherCost += getGatherCost(GatheredScalars);
-      return GatherCost;
-    }
-    if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) {
-      // Check that gather of extractelements can be represented as just a
-      // shuffle of a single/two vectors the scalars are extracted from.
-      // Found the bunch of extractelement instructions that must be gathered
-      // into a vector and can be represented as a permutation elements in a
-      // single input vector or of 2 input vectors.
-      InstructionCost Cost =
-          computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI);
-      AdjustExtractsCost(Cost, ExtractMask);
-      if (NeedToShuffleReuses)
-        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                    FinalVecTy, E->ReuseShuffleIndices);
-      return Cost;
+      // Check if second vector is required if the used elements are already
+      // used from the first one.
+      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+        if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) {
+          InVectors.push_back(V1);
+          break;
+        }
     }
-    if (isSplat(VL)) {
-      // Found the broadcasting of the single scalar, calculate the cost as the
-      // broadcast.
-      assert(VecTy == FinalVecTy &&
-             "No reused scalars expected for broadcast.");
+    int VF = CommonMask.size();
+    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
+      VF = FTy->getNumElements();
+    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+      if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem)
+        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
+  }
+  /// Adds \p V1 using the mask that inversePermutation() builds from \p Order.
+  void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
+    SmallVector<int, 4> InverseMask;
+    inversePermutation(Order, InverseMask);
+    add(V1, InverseMask);
+  }
+  Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+    auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+    auto BuildVectorCost = [&](ArrayRef<Value *> VL,
+                               Value *Root) -> InstructionCost {
+      InstructionCost GatherCost = 0;
+      SmallVector<Value *> Gathers(VL.begin(), VL.end());
+      BoUpSLP::ValueSet VectorizedLoads;
+      // Improve gather cost for gather of loads, if we can group some of the
+      // loads into vector loads.
+      InstructionsState S = getSameOpcode(VL, *R.TLI);
+      if (VL.size() > 2 && S.getOpcode() == Instruction::Load &&
+          !S.isAltShuffle() &&
+          !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
+          !isSplat(Gathers)) {
+        unsigned StartIdx = 0;
+        unsigned VF = VL.size() / 2;
+        unsigned VectorizedCnt = 0;
+        unsigned ScatterVectorizeCnt = 0;
+        const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType());
+        for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
+          for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
+               Cnt += VF) {
+            ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+            if (!VectorizedLoads.count(Slice.front()) &&
+                !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
+              SmallVector<Value *> PointerOps;
+              OrdersType CurrentOrder;
+              LoadsState LS =
+                  canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE,
+                                    *R.LI, *R.TLI, CurrentOrder, PointerOps);
+              switch (LS) {
+              case LoadsState::Vectorize:
+              case LoadsState::ScatterVectorize:
+                // Mark the vectorized loads so that we don't vectorize them
+                // again.
+                if (LS == LoadsState::Vectorize)
+                  ++VectorizedCnt;
+                else
+                  ++ScatterVectorizeCnt;
+                VectorizedLoads.insert(Slice.begin(), Slice.end());
+                // If we vectorized initial block, no need to try to vectorize
+                // it again.
+                if (Cnt == StartIdx)
+                  StartIdx += VF;
+                break;
+              case LoadsState::Gather:
+                break;
+              }
+            }
+          }
+          // Check if the whole array was vectorized already - exit.
+          if (StartIdx >= VL.size())
+            break;
+          // Found vectorizable parts - exit.
+          if (!VectorizedLoads.empty())
+            break;
+        }
+        if (!VectorizedLoads.empty()) {
+          // Get the cost for gathered loads.
+          for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
+            if (!VectorizedLoads.contains(VL[I]))
+              continue;
+            // Exclude potentially vectorized loads from list of gathered
+            // scalars.
+            for (unsigned K = I, End = I + VF; K < End; ++K)
+              Gathers[K] = PoisonValue::get(Gathers[K]->getType());
+          }
+          // The cost for vectorized loads.
+          InstructionCost ScalarsCost = 0;
+          for (Value *V : VectorizedLoads) {
+            auto *LI = cast<LoadInst>(V);
+            ScalarsCost += TTI.getMemoryOpCost(
+                Instruction::Load, LI->getType(), LI->getAlign(),
+                LI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo(),
+                LI);
+          }
+          auto *LI = cast<LoadInst>(S.MainOp);
+          auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
+          Align Alignment = LI->getAlign();
+          GatherCost +=
+              VectorizedCnt *
+              TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+                                  LI->getPointerAddressSpace(), CostKind,
+                                  TTI::OperandValueInfo(), LI);
+          GatherCost += ScatterVectorizeCnt *
+                        TTI.getGatherScatterOpCost(
+                            Instruction::Load, LoadTy, LI->getPointerOperand(),
+                            /*VariableMask=*/false, Alignment, CostKind, LI);
+          // Add the cost for the subvectors shuffling.
+          GatherCost += (VectorizedCnt + ScatterVectorizeCnt - 1) *
+                        TTI.getShuffleCost(TTI::SK_Select, VecTy);
+          GatherCost -= ScalarsCost;
+        }
+      } else if (!Root && !allConstant(VL) && isSplat(VL)) {
+        // Found the broadcasting of the single scalar, calculate the cost as
+        // the broadcast.
       const auto *It =
           find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
-      // If all values are undefs - consider cost free.
-      if (It == VL.end())
-        return TTI::TCC_Free;
+        assert(It != VL.end() && "Expected at least one non-undef value.");
       // Add broadcast for non-identity shuffle only.
       bool NeedShuffle =
           count(VL, *It) > 1 &&
           (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
-      InstructionCost InsertCost = TTI->getVectorInstrCost(
+      InstructionCost InsertCost = TTI.getVectorInstrCost(
           Instruction::InsertElement, VecTy, CostKind,
           NeedShuffle ? 0 : std::distance(VL.begin(), It),
           PoisonValue::get(VecTy), *It);
-      return InsertCost + (NeedShuffle
-                               ? TTI->getShuffleCost(
-                                     TargetTransformInfo::SK_Broadcast, VecTy,
-                                     /*Mask=*/std::nullopt, CostKind,
-                                     /*Index=*/0,
-                                     /*SubTp=*/nullptr, /*Args=*/*It)
-                               : TTI::TCC_Free);
-    }
-    InstructionCost ReuseShuffleCost = 0;
-    if (NeedToShuffleReuses)
-      ReuseShuffleCost = TTI->getShuffleCost(
-          TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
-    // Improve gather cost for gather of loads, if we can group some of the
-    // loads into vector loads.
-    if (VL.size() > 2 && E->getOpcode() == Instruction::Load &&
-        !E->isAltShuffle()) {
-      BoUpSLP::ValueSet VectorizedLoads;
-      unsigned StartIdx = 0;
-      unsigned VF = VL.size() / 2;
-      unsigned VectorizedCnt = 0;
-      unsigned ScatterVectorizeCnt = 0;
-      const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType());
-      for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
-        for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
-             Cnt += VF) {
-          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
-          if (!VectorizedLoads.count(Slice.front()) &&
-              !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
-            SmallVector<Value *> PointerOps;
-            OrdersType CurrentOrder;
-            LoadsState LS =
-                canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,
-                                  *TLI, CurrentOrder, PointerOps);
-            switch (LS) {
-            case LoadsState::Vectorize:
-            case LoadsState::ScatterVectorize:
-              // Mark the vectorized loads so that we don't vectorize them
-              // again.
-              if (LS == LoadsState::Vectorize)
-                ++VectorizedCnt;
-              else
-                ++ScatterVectorizeCnt;
-              VectorizedLoads.insert(Slice.begin(), Slice.end());
-              // If we vectorized initial block, no need to try to vectorize it
-              // again.
-              if (Cnt == StartIdx)
-                StartIdx += VF;
-              break;
-            case LoadsState::Gather:
-              break;
-            }
-          }
+      return InsertCost +
+             (NeedShuffle ? TTI.getShuffleCost(
+                                TargetTransformInfo::SK_Broadcast, VecTy,
+                                /*Mask=*/std::nullopt, CostKind,
+                                /*Index=*/0, /*SubTp=*/nullptr, /*Args=*/*It)
+                          : TTI::TCC_Free);
+      }
+      return GatherCost + R.getGatherCost(Gathers);
+    };
+    Cost += BuildVectorCost(VL, Root);
+    if (!Root) {
+      SmallVector<Constant *> Vals;
+      for (Value *V : VL) {
+        if (isa<UndefValue>(V)) {
+          Vals.push_back(cast<Constant>(V));
+          continue;
         }
-        // Check if the whole array was vectorized already - exit.
-        if (StartIdx >= VL.size())
-          break;
-        // Found vectorizable parts - exit.
-        if (!VectorizedLoads.empty())
-          break;
+        Vals.push_back(Constant::getNullValue(V->getType()));
+      }
+      return ConstantVector::get(Vals);
+    }
+    return ConstantVector::getSplat(
+        ElementCount::getFixed(VL.size()),
+        Constant::getNullValue(VL.front()->getType()));
+  }
+  InstructionCost createFreeze(InstructionCost Cost) { return Cost; } // No extra cost is added for the freeze.
+  /// Finalize the shuffles' cost estimation and return the accumulated cost.
+  InstructionCost
+  finalize(ArrayRef<int> ExtMask,
+           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
+    IsFinalized = true;
+    if (Action) {
+      Value *Vec = InVectors.front();
+      if (InVectors.size() == 2) {
+        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
+        InVectors.pop_back();
+      } else {
+        Cost += createShuffle(Vec, nullptr, CommonMask);
       }
-      if (!VectorizedLoads.empty()) {
-        InstructionCost GatherCost = 0;
-        unsigned NumParts = TTI->getNumberOfParts(VecTy);
-        bool NeedInsertSubvectorAnalysis =
-            !NumParts || (VL.size() / VF) > NumParts;
-        // Get the cost for gathered loads.
-        for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
-          if (VectorizedLoads.contains(VL[I]))
+      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+        if (CommonMask[Idx] != UndefMaskElem)
+          CommonMask[Idx] = Idx;
+      Action(Vec, CommonMask);
+      InVectors.front() = Vec;
+    }
+    if (!ExtMask.empty()) {
+      if (CommonMask.empty()) {
+        CommonMask.assign(ExtMask.begin(), ExtMask.end());
+      } else {
+        SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
+          if (ExtMask[I] == UndefMaskElem)
             continue;
-          GatherCost += getGatherCost(VL.slice(I, VF));
-        }
-        // The cost for vectorized loads.
-        InstructionCost ScalarsCost = 0;
-        for (Value *V : VectorizedLoads) {
-          auto *LI = cast<LoadInst>(V);
-          ScalarsCost +=
-              TTI->getMemoryOpCost(Instruction::Load, LI->getType(),
-                                   LI->getAlign(), LI->getPointerAddressSpace(),
-                                   CostKind, TTI::OperandValueInfo(), LI);
-        }
-        auto *LI = cast<LoadInst>(E->getMainOp());
-        auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
-        Align Alignment = LI->getAlign();
-        GatherCost +=
-            VectorizedCnt *
-            TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
-                                 LI->getPointerAddressSpace(), CostKind,
-                                 TTI::OperandValueInfo(), LI);
-        GatherCost += ScatterVectorizeCnt *
-                      TTI->getGatherScatterOpCost(
-                          Instruction::Load, LoadTy, LI->getPointerOperand(),
-                          /*VariableMask=*/false, Alignment, CostKind, LI);
-        if (NeedInsertSubvectorAnalysis) {
-          // Add the cost for the subvectors insert.
-          for (int I = VF, E = VL.size(); I < E; I += VF)
-            GatherCost +=
-                TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
-                                    std::nullopt, CostKind, I, LoadTy);
-        }
-        return ReuseShuffleCost + GatherCost - ScalarsCost;
-      }
-    }
-    return ReuseShuffleCost + getGatherCost(VL);
+          NewMask[I] = CommonMask[ExtMask[I]];
+        }
+        CommonMask.swap(NewMask);
+      }
+    }
+    if (CommonMask.empty()) {
+      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
+      return Cost;
+    }
+    if (InVectors.size() == 2)
+      return Cost +
+             createShuffle(InVectors.front(), InVectors.back(), CommonMask);
+    return Cost + createShuffle(InVectors.front(), nullptr, CommonMask);
+  }
+
+  ~ShuffleCostEstimator() { // Asserts that finalize() ran if a mask was accumulated.
+    assert((IsFinalized || CommonMask.empty()) &&
+           "Shuffle construction must be finalized.");
+  }
+};
+
+InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
+                                      ArrayRef<Value *> VectorizedVals) {
+  ArrayRef<Value *> VL = E->Scalars;
+
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
+    ScalarTy = CI->getOperand(0)->getType();
+  else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+    ScalarTy = IE->getOperand(1)->getType();
+  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+  // If we have computed a smaller type for the expression, update VecTy so
+  // that the costs will be accurate.
+  if (MinBWs.count(VL[0]))
+    VecTy = FixedVectorType::get(
+        IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+  unsigned EntryVF = E->getVectorFactor();
+  auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
+
+  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+  if (E->State == TreeEntry::NeedToGather) {
+    if (allConstant(VL))
+      return 0;
+    if (isa<InsertElementInst>(VL[0]))
+      return InstructionCost::getInvalid();
+    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
+        E, *TTI, VectorizedVals, *this);
   }
   InstructionCost CommonCost = 0;
   SmallVector<int> Mask;
@@ -8861,7 +8952,7 @@
   Builder.SetCurrentDebugLocation(Front->getDebugLoc());
 }
 
-Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
+Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
   // List of instructions/lanes from current block and/or the blocks which are
   // part of the current loop. These instructions will be inserted at the end to
   // make it possible to optimize loops and hoist invariant instructions out of
@@ -8878,7 +8969,8 @@
   for (int I = 0, E = VL.size(); I < E; ++I) {
     if (auto *Inst = dyn_cast<Instruction>(VL[I]))
       if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
-           getTreeEntry(Inst) || (L && (L->contains(Inst)))) &&
+           getTreeEntry(Inst) ||
+           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
           PostponedIndices.insert(I).second)
         PostponedInsts.emplace_back(Inst, I);
   }
@@ -8901,7 +8993,7 @@
   Value *Val0 =
       isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
   FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
-  Value *Vec = PoisonValue::get(VecTy);
+  Value *Vec = Root ? Root : PoisonValue::get(VecTy);
   SmallVector<int> NonConsts;
   // Insert constant values at first.
   for (int I = 0, E = VL.size(); I < E; ++I) {
@@ -8911,6 +9003,18 @@
       NonConsts.push_back(I);
       continue;
     }
+    if (Root) {
+      if (!isa<UndefValue>(VL[I])) {
+        NonConsts.push_back(I);
+        continue;
+      }
+      if (isa<PoisonValue>(VL[I]))
+        continue;
+      if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
+        if (SV->getMaskValue(I) == UndefMaskElem)
+          continue;
+      }
+    }
     Vec = CreateInsertElement(Vec, VL[I], I);
   }
   // Insert non-constant values.
@@ -9009,6 +9113,10 @@
       }
       return Vec;
     }
+    Value *createIdentity(Value *V) { return V; } // Identity: V is returned as is.
+    Value *createPoison(Type *Ty, unsigned VF) { // Poison vector of VF elements of Ty.
+      return PoisonValue::get(FixedVectorType::get(Ty, VF));
+    }
     /// Resizes 2 input vector to match the sizes, if the they are not equal
     /// yet. The smallest vector is resized to the size of the larger vector.
     void resizeToMatch(Value *&V1, Value *&V2) {
@@ -9041,7 +9149,8 @@
     assert(V1 && "Expected at least one vector value.");
     ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                     R.CSEBlocks);
-    return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder);
+    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
+                                                       ShuffleBuilder);
   }
 
   /// Transforms mask \p CommonMask per given \p Mask to make proper set after
@@ -9057,6 +9166,41 @@
   ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
       : Builder(Builder), R(R) {}
 
+  Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) { // Returns the source vector of the last masked extract, or nullptr.
+    Value *VecBase = nullptr;
+    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+      int Idx = Mask[I];
+      if (Idx == UndefMaskElem)
+        continue; // Undef lane: nothing to adjust.
+      auto *EI = cast<ExtractElementInst>(E->Scalars[I]); // Masked lanes must be extractelements.
+      VecBase = EI->getVectorOperand(); // A later lane overwrites the base found earlier.
+      // If all users are vectorized - can delete the extractelement itself.
+      if (any_of(EI->users(),
+                 [&](User *U) { return !R.ScalarToTreeEntry.count(U); }))
+        continue; // Some user is not in R.ScalarToTreeEntry: keep EI.
+      R.eraseInstruction(EI);
+    }
+    return VecBase;
+  }
+  std::optional<Value *> needToDelay(const TreeEntry *E,
+                                     ArrayRef<const TreeEntry *> Deps) const { // Returns std::nullopt when no delay is needed.
+    // No need to delay emission if all deps are ready (have a VectorizedValue).
+    if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
+      return std::nullopt;
+    // Postpone gather emission, will be emitted after the end of the
+    // process to keep correct order.
+    auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
+                                       E->getVectorFactor());
+    Value *Vec = Builder.CreateAlignedLoad(
+        VecTy, PoisonValue::get(VecTy->getPointerTo()), MaybeAlign());
+    return Vec; // Placeholder: load from a poison pointer; presumably replaced later - confirm.
+  }
+  void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) { // Forwards both entries' VectorizedValue.
+    add(E1->VectorizedValue, E2->VectorizedValue, Mask);
+  }
+  void add(const TreeEntry *E1, ArrayRef<int> Mask) { // Forwards the entry's VectorizedValue.
+    add(E1->VectorizedValue, Mask);
+  }
   /// Adds 2 input vectors and the mask for their shuffling.
   void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
     assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
@@ -9148,10 +9292,29 @@
     inversePermutation(Order, NewMask);
     add(V1, NewMask);
   }
+  Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) { // Delegates to R.gather.
+    return R.gather(VL, Root);
+  }
+  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } // Emits a freeze of V via the IR builder.
   /// Finalize emission of the shuffles.
   Value *
-  finalize(ArrayRef<int> ExtMask = std::nullopt) {
+  finalize(ArrayRef<int> ExtMask,
+           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
     IsFinalized = true;
+    if (Action) {
+      Value *Vec = InVectors.front();
+      if (InVectors.size() == 2) {
+        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
+        InVectors.pop_back();
+      } else {
+        Vec = createShuffle(Vec, nullptr, CommonMask);
+      }
+      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+        if (CommonMask[Idx] != UndefMaskElem)
+          CommonMask[Idx] = Idx;
+      Action(Vec, CommonMask);
+      InVectors.front() = Vec;
+    }
     if (!ExtMask.empty()) {
       if (CommonMask.empty()) {
         CommonMask.assign(ExtMask.begin(), ExtMask.end());
@@ -9276,101 +9439,326 @@
   return vectorizeTree(I->get());
 }
 
-Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
+template <typename BVTy, typename ResTy, typename... Args>
+ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
   assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
   unsigned VF = E->getVectorFactor();
 
-  ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
-  SmallVector<Value *> Gathered(
-      VF, PoisonValue::get(E->Scalars.front()->getType()));
   bool NeedFreeze = false;
-  SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
+  bool RebuiltVector = false;
+  SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+                                        E->ReuseShuffleIndices.end());
+  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
   // Build a mask out of the redorder indices and reorder scalars per this mask.
   SmallVector<int> ReorderMask;
   inversePermutation(E->ReorderIndices, ReorderMask);
   if (!ReorderMask.empty())
-    reorderScalars(VL, ReorderMask);
-  SmallVector<int> ReuseMask(VF, UndefMaskElem);
-  if (!allConstant(VL)) {
-    // For splats with can emit broadcasts instead of gathers, so try to find
-    // such sequences.
-    bool IsSplat = isSplat(VL) && (VL.size() > 2 || VL.front() == VL.back());
-    SmallVector<int> UndefPos;
-    DenseMap<Value *, unsigned> UniquePositions;
-    // Gather unique non-const values and all constant values.
-    // For repeated values, just shuffle them.
-    for (auto [I, V] : enumerate(VL)) {
-      if (isa<UndefValue>(V)) {
-        if (!isa<PoisonValue>(V)) {
-          Gathered[I] = V;
-          ReuseMask[I] = I;
-          UndefPos.push_back(I);
+    reorderScalars(GatheredScalars, ReorderMask);
+  auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) {
+    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
+          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
+        }))
+      return false;
+    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
+    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
+    if (UserTE->getNumOperands() != 2)
+      return false;
+    auto *It =
+        find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
+          return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
+                   return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
+                 }) != TE->UserTreeIndices.end();
+        });
+    if (It == VectorizableTree.end())
+      return false;
+    unsigned I =
+        *find_if_not(Mask, [](int Idx) { return Idx == UndefMaskElem; });
+    int Sz = Mask.size();
+    if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) &&
+        ShuffleVectorInst::isIdentityMask(Mask))
+      std::iota(Mask.begin(), Mask.end(), 0);
+    else
+      std::fill(Mask.begin(), Mask.end(), I);
+    return true;
+  };
+  BVTy GatherBuilder(Params...);
+  ResTy Res = ResTy();
+  SmallVector<int> Mask;
+  SmallVector<int> ExtractMask;
+  SmallVector<int> ReuseMask;
+  std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
+  std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
+  SmallVector<const TreeEntry *> Entries;
+  Type *ScalarTy = GatheredScalars.front()->getType();
+  bool IsNonPoisoned = true;
+  bool IsUsedInExpr = false;
+  SmallVector<const TreeEntry *, 2> ReusedEntries;
+  if (!all_of(GatheredScalars, UndefValue::classof)) {
+    // Check for gathered extracts.
+    ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+    SmallVector<Value *> IgnoredVals;
+    if (UserIgnoreList)
+      IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+    // Need to remove vectorized extractelement instructions.
+    Value *VecBase = GatherBuilder.adjustExtracts(E, ExtractMask);
+    bool Resized = false;
+    if (VecBase)
+      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+        if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+          Resized = true;
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
         }
-        continue;
+    // Gather extracts after we check for full matched gathers only.
+    if (E->getOpcode() != Instruction::Load || E->isAltShuffle() ||
+        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
+        isSplat(E->Scalars) ||
+        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
+      GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+    }
+    if (GatherShuffle) {
+      if (std::optional<ResTy> Delayed =
+              GatherBuilder.needToDelay(E, Entries)) {
+        // Delay emission of gathers which are not ready yet.
+        PostponedGathers.insert(E);
+        // Postpone gather emission, will be emitted after the end of the
+        // process to keep correct order.
+        return *Delayed;
       }
-      if (isConstant(V)) {
-        Gathered[I] = V;
-        ReuseMask[I] = I;
-        continue;
+      assert((Entries.size() == 1 || Entries.size() == 2) &&
+             "Expected shuffle of 1 or 2 entries.");
+      if (!Resized) {
+        unsigned VF1 = Entries.front()->getVectorFactor();
+        unsigned VF2 = Entries.back()->getVectorFactor();
+        if ((VF == VF1 && GatheredScalars.size() != VF1) ||
+            (VF == VF2 && GatheredScalars.size() != VF2))
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
       }
-      if (IsSplat) {
-        Gathered.front() = V;
-        ReuseMask[I] = 0;
-      } else {
-        const auto Res = UniquePositions.try_emplace(V, I);
-        Gathered[Res.first->second] = V;
-        ReuseMask[I] = Res.first->second;
-      }
-    }
-    if (!UndefPos.empty() && IsSplat) {
-      // For undef values, try to replace them with the simple broadcast.
-      // We can do it if the broadcasted value is guaranteed to be
-      // non-poisonous, or by freezing the incoming scalar value first.
-      auto *It = find_if(Gathered, [this, E](Value *V) {
-        return !isa<UndefValue>(V) &&
-               (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
-                any_of(V->uses(), [E](const Use &U) {
-                  // Check if the value already used in the same operation in
-                  // one of the nodes already.
-                  return E->UserTreeIndices.size() == 1 &&
-                         is_contained(
-                             E->UserTreeIndices.front().UserTE->Scalars,
-                             U.getUser()) &&
-                         E->UserTreeIndices.front().EdgeIdx != U.getOperandNo();
-                }));
-      });
-      if (It != Gathered.end()) {
-        // Replace undefs by the non-poisoned scalars and emit broadcast.
-        int Pos = std::distance(Gathered.begin(), It);
-        for_each(UndefPos, [&](int I) {
-          // Set the undef position to the non-poisoned scalar.
-          ReuseMask[I] = Pos;
-          // Replace the undef by the poison, in the mask it is replaced by non-poisoned scalar already.
-          if (I != Pos)
-            Gathered[I] = PoisonValue::get(Gathered[I]->getType());
-        });
+      if (*GatherShuffle == TTI::SK_PermuteSingleSrc)
+        IsUsedInExpr = FindReusedSplat(Mask);
+      // Remove shuffled elements from list of gathers.
+      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+        if (Mask[I] != UndefMaskElem)
+          GatheredScalars[I] = PoisonValue::get(ScalarTy);
+      }
+      if (Entries.front()->VectorizedValue)
+        IsNonPoisoned &=
+            isGuaranteedNotToBePoison(Entries.front()->VectorizedValue);
+      ReusedEntries.push_back(Entries.front());
+      if (Entries.size() == 1) {
+        GatherBuilder.add(Entries.front(), Mask);
       } else {
-        // Replace undefs by the poisons, emit broadcast and then emit
-        // freeze.
-        for_each(UndefPos, [&](int I) {
-          ReuseMask[I] = UndefMaskElem;
-          if (isa<UndefValue>(Gathered[I]))
-            Gathered[I] = PoisonValue::get(Gathered[I]->getType());
+        if (Entries.back()->VectorizedValue)
+          IsNonPoisoned &=
+              isGuaranteedNotToBePoison(Entries.back()->VectorizedValue);
+        GatherBuilder.add(Entries.front(), Entries.back(), Mask);
+        ReusedEntries.push_back(Entries.back());
+      }
+    } else if (!allConstant(GatheredScalars)) {
+      // For splats we can emit broadcasts instead of gathers, so try to find
+      // such sequences.
+      bool IsSplat = isSplat(GatheredScalars) &&
+                     (GatheredScalars.size() > 2 ||
+                      GatheredScalars.front() == GatheredScalars.back());
+      GatheredScalars.append(VF - GatheredScalars.size(),
+                             PoisonValue::get(ScalarTy));
+      ReuseMask.assign(VF, UndefMaskElem);
+      SmallVector<int> UndefPos;
+      DenseMap<Value *, unsigned> UniquePositions;
+      // Gather unique non-const values and all constant values.
+      // For repeated values, just shuffle them.
+      int NumNonConsts = 0;
+      int SinglePos = 0;
+      for (auto [I, V] : enumerate(GatheredScalars)) {
+        if (isa<UndefValue>(V)) {
+          if (!isa<PoisonValue>(V)) {
+            ReuseMask[I] = I;
+            UndefPos.push_back(I);
+          }
+          continue;
+        }
+        if (isConstant(V)) {
+          ReuseMask[I] = I;
+          continue;
+        }
+        ++NumNonConsts;
+        SinglePos = I;
+        Value *OrigV = V;
+        V = PoisonValue::get(ScalarTy);
+        if (IsSplat) {
+          RebuiltVector |= I != 0;
+          GatheredScalars.front() = OrigV;
+          ReuseMask[I] = 0;
+        } else {
+          const auto Res = UniquePositions.try_emplace(OrigV, I);
+          RebuiltVector |= Res.first->second != I;
+          GatheredScalars[Res.first->second] = OrigV;
+          ReuseMask[I] = Res.first->second;
+        }
+      }
+      if (NumNonConsts == 1) {
+        // Restore single insert element.
+        RebuiltVector = false;
+        if (IsSplat) {
+          ReuseMask.assign(VF, UndefMaskElem);
+          std::swap(GatheredScalars.front(), GatheredScalars[SinglePos]);
+          if (!UndefPos.empty() && UndefPos.front() == 0)
+            GatheredScalars.front() = UndefValue::get(ScalarTy);
+        }
+        ReuseMask[SinglePos] = SinglePos;
+      } else if (!UndefPos.empty() && IsSplat) {
+        // For undef values, try to replace them with the simple broadcast.
+        // We can do it if the broadcasted value is guaranteed to be
+        // non-poisonous, or by freezing the incoming scalar value first.
+        auto *It = find_if(GatheredScalars, [this, E](Value *V) {
+          return !isa<UndefValue>(V) &&
+                 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
+                  (E->UserTreeIndices.size() == 1 &&
+                   any_of(V->uses(), [E](const Use &U) {
+                     // Check if the value already used in the same operation in
+                     // one of the nodes already.
+                     return E->UserTreeIndices.front().EdgeIdx !=
+                                U.getOperandNo() &&
+                            is_contained(
+                                E->UserTreeIndices.front().UserTE->Scalars,
+                                U.getUser());
+                   })));
         });
-        NeedFreeze = true;
+        if (It != GatheredScalars.end()) {
+          // Replace undefs by the non-poisoned scalars and emit broadcast.
+          int Pos = std::distance(GatheredScalars.begin(), It);
+          for_each(UndefPos, [&](int I) {
+            // Set the undef position to the non-poisoned scalar.
+            ReuseMask[I] = Pos;
+            // Replace the undef by the poison, in the mask it is replaced by
+            // non-poisoned scalar already.
+            if (I != Pos)
+              GatheredScalars[I] = PoisonValue::get(ScalarTy);
+          });
+        } else {
+          // Replace undefs by the poisons, emit broadcast and then emit
+          // freeze.
+          for_each(UndefPos, [&](int I) {
+            ReuseMask[I] = UndefMaskElem;
+            if (isa<UndefValue>(GatheredScalars[I]))
+              GatheredScalars[I] = PoisonValue::get(ScalarTy);
+          });
+          NeedFreeze = true;
+        }
       }
     }
+  }
+  // Combine generated extracts mask and reused scalars masks and
+  // corresponding input vectors.
+  if (ExtractShuffle) {
+    // Gather of extractelements can be represented as just a shuffle of
+    // a single/two vectors the scalars are extracted from.
+    // Find input vectors.
+    Value *Vec1 = nullptr;
+    Value *Vec2 = nullptr;
+    if (*ExtractShuffle == TTI::SK_PermuteSingleSrc)
+      IsUsedInExpr = FindReusedSplat(ExtractMask);
+    for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
+      if (ExtractMask[I] == UndefMaskElem ||
+          (!Mask.empty() && Mask[I] != UndefMaskElem)) {
+        ExtractMask[I] = UndefMaskElem;
+        continue;
+      }
+      if (isa<UndefValue>(E->Scalars[I]))
+        continue;
+      auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+      if (!Vec1) {
+        Vec1 = EI->getVectorOperand();
+      } else if (Vec1 != EI->getVectorOperand()) {
+        assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+               "Expected only 1 or 2 vectors shuffle.");
+        Vec2 = EI->getVectorOperand();
+      }
+    }
+    if (Vec2) {
+      IsNonPoisoned &=
+          isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
+      GatherBuilder.add(Vec1, Vec2, ExtractMask);
+    } else if (Vec1) {
+      IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
+      GatherBuilder.add(Vec1, ExtractMask);
+    } else {
+      GatherBuilder.add(PoisonValue::get(FixedVectorType::get(
+                            ScalarTy, GatheredScalars.size())),
+                        ExtractMask);
+    }
+  }
+  if (ExtractShuffle || GatherShuffle) {
+    // Insert non-constant scalars.
+    SmallVector<Value *> NonConstants(GatheredScalars);
+    int EMSz = ExtractMask.size();
+    int MSz = Mask.size();
+    bool EnoughConsts =
+        !RebuiltVector && (!ExtractShuffle || !GatherShuffle) &&
+        ((ExtractShuffle &&
+          (*ExtractShuffle != TTI::SK_PermuteSingleSrc ||
+           any_of(ExtractMask, [&](int I) { return I >= EMSz; }) ||
+           !ShuffleVectorInst::isIdentityMask(ExtractMask))) ||
+         (GatherShuffle && (*GatherShuffle != TTI::SK_PermuteSingleSrc ||
+                            any_of(Mask, [&](int I) { return I >= MSz; }) ||
+                            !ShuffleVectorInst::isIdentityMask(Mask))) ||
+         count_if(GatheredScalars, [](Value *V) {
+           return isa<Constant>(V) && !isa<PoisonValue>(V);
+         }) > 1);
+    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
+      if (EnoughConsts && isa<Constant>(GatheredScalars[I]))
+        NonConstants[I] = PoisonValue::get(ScalarTy);
+      else
+        GatheredScalars[I] = PoisonValue::get(ScalarTy);
+    }
+    // Generate constants for final shuffle.
+    if (!all_of(GatheredScalars, UndefValue::classof)) {
+      Mask.assign(GatheredScalars.size(), UndefMaskElem);
+      Value *VecVal = GatherBuilder.gather(GatheredScalars);
+      for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
+        if (!isa<UndefValue>(GatheredScalars[I]))
+          Mask[I] = I;
+      }
+      GatherBuilder.add(VecVal, Mask);
+      IsNonPoisoned &= isGuaranteedNotToBePoison(VecVal);
+    }
+    NeedFreeze = !IsNonPoisoned && !IsUsedInExpr &&
+                 any_of(GatheredScalars, [](Value *V) {
+                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
+                 });
+    // Emit final insertelement instructions for defined values.
+    if (!RebuiltVector && !all_of(NonConstants, UndefValue::classof)) {
+      Res = GatherBuilder.finalize(
+          ReuseShuffleIndicies, [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
+            Vec = GatherBuilder.gather(NonConstants, Vec);
+            for (unsigned I = 0, Sz = Mask.size(); I < Sz; ++I)
+              if ((!EnoughConsts && !isa<PoisonValue>(NonConstants[I])) ||
+                  !isa<Constant>(NonConstants[I]))
+                Mask[I] = I;
+          });
+    } else {
+      if (RebuiltVector && !all_of(NonConstants, UndefValue::classof)) {
+        // Just generate simple gather, no reused scalars/extracts.
+        Value *BV = GatherBuilder.gather(NonConstants);
+        GatherBuilder.add(BV, ReuseMask);
+      }
+      Res = GatherBuilder.finalize(ReuseShuffleIndicies);
+    }
   } else {
-    ReuseMask.clear();
-    copy(VL, Gathered.begin());
+    // Just generate simple gather, no reused scalars/extracts.
+    Value *BV = GatherBuilder.gather(GatheredScalars);
+    GatherBuilder.add(BV, ReuseMask);
+    Res = GatherBuilder.finalize(ReuseShuffleIndicies);
   }
-  // Gather unique scalars and all constants.
-  Value *Vec = gather(Gathered);
-  ShuffleBuilder.add(Vec, ReuseMask);
-  Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
   if (NeedFreeze)
-    Vec = Builder.CreateFreeze(Vec);
-  return Vec;
+    Res = GatherBuilder.createFreeze(Res);
+  return Res;
+}
+
+Value *BoUpSLP::createBuildVector(const TreeEntry *E) { // Instantiates processBuildVector with the IR-emitting ShuffleInstructionBuilder.
+  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
+                                                                *this);
 }
 
 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
@@ -9381,6 +9769,14 @@
     return E->VectorizedValue;
   }
 
+  if (E->State == TreeEntry::NeedToGather) {
+    if (E->getMainOp() && E->Idx == 0)
+      setInsertPointAfterBundle(E);
+    Value *Vec = createBuildVector(E);
+    E->VectorizedValue = Vec;
+    return Vec;
+  }
+
   auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
     ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
     if (E->State != TreeEntry::NeedToGather &&
@@ -9394,199 +9790,6 @@
     }
     return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
   };
-
-  if (E->State == TreeEntry::NeedToGather) {
-    if (E->getMainOp() && E->Idx == 0)
-      setInsertPointAfterBundle(E);
-    unsigned VF = E->getVectorFactor();
-    auto AdjustExtracts = [&](const TreeEntry *E, ArrayRef<int> Mask) {
-      Value *VecBase = nullptr;
-      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
-        int Idx = Mask[I];
-        if (Idx == UndefMaskElem)
-          continue;
-        auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
-        VecBase = EI->getVectorOperand();
-        // TODO: EI can be erased, if all its users are vectorized. But need to
-        // emit shuffles for such extractelement instructions.
-      }
-      return VecBase;
-    };
-    auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
-      unsigned VF1 = cast<FixedVectorType>(V1->getType())->getNumElements();
-      unsigned VF2 = cast<FixedVectorType>(V2->getType())->getNumElements();
-      unsigned VF = std::max(VF1, VF2);
-      if (VF1 != VF2) {
-        SmallVector<int> ExtMask(VF, UndefMaskElem);
-        std::iota(ExtMask.begin(),
-                  std::next(ExtMask.begin(), std::min(VF1, VF2)), 0);
-        if (VF1 < VF2) {
-          V1 = Builder.CreateShuffleVector(V1, ExtMask);
-          if (auto *I = dyn_cast<Instruction>(V1)) {
-            GatherShuffleExtractSeq.insert(I);
-            CSEBlocks.insert(I->getParent());
-          }
-        } else {
-          V2 = Builder.CreateShuffleVector(V2, ExtMask);
-          if (auto *I = dyn_cast<Instruction>(V2)) {
-            GatherShuffleExtractSeq.insert(I);
-            CSEBlocks.insert(I->getParent());
-          }
-        }
-      }
-      const int Limit = Mask.size() * 2;
-      if (V1 == V2 && Mask.size() == VF &&
-          all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
-          (ShuffleVectorInst::isIdentityMask(Mask) ||
-           (ShuffleVectorInst::isZeroEltSplatMask(Mask) &&
-            isa<ShuffleVectorInst>(V1) &&
-            cast<ShuffleVectorInst>(V1)->getShuffleMask() == Mask)))
-        return V1;
-      Value *Vec = V1 == V2 ? Builder.CreateShuffleVector(V1, Mask)
-                            : Builder.CreateShuffleVector(V1, V2, Mask);
-      if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherShuffleExtractSeq.insert(I);
-        CSEBlocks.insert(I->getParent());
-      }
-      return Vec;
-    };
-    auto NeedToDelay = [=](const TreeEntry *E,
-                           ArrayRef<const TreeEntry *> Deps) -> Value * {
-      // No need to delay emission if all deps are ready.
-      if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
-        return nullptr;
-      // Postpone gather emission, will be emitted after the end of the
-      // process to keep correct order.
-      auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
-                                         E->getVectorFactor());
-      Value *Vec = Builder.CreateAlignedLoad(
-          VecTy, PoisonValue::get(VecTy->getPointerTo()), MaybeAlign());
-      return Vec;
-    };
-
-    SmallVector<int>
-        ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
-                             E->ReuseShuffleIndices.end());
-    SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
-    // Build a mask out of the reorder indices and reorder scalars per this
-    // mask.
-    SmallVector<int> ReorderMask;
-    inversePermutation(E->ReorderIndices, ReorderMask);
-    if (!ReorderMask.empty())
-      reorderScalars(GatheredScalars, ReorderMask);
-    Value *Vec = nullptr;
-    SmallVector<int> Mask;
-    SmallVector<int> ExtractMask;
-    std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
-    std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
-    SmallVector<const TreeEntry *> Entries;
-    Type *ScalarTy = GatheredScalars.front()->getType();
-    if (!all_of(GatheredScalars, UndefValue::classof)) {
-      // Check for gathered extracts.
-      ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
-      SmallVector<Value *> IgnoredVals;
-      if (UserIgnoreList)
-        IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
-      bool Resized = false;
-      if (Value *VecBase = AdjustExtracts(E, ExtractMask))
-        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
-          if (VF == VecBaseTy->getNumElements() &&
-              GatheredScalars.size() != VF) {
-            Resized = true;
-            GatheredScalars.append(VF - GatheredScalars.size(),
-                                   PoisonValue::get(ScalarTy));
-          }
-      // Gather extracts after we check for full matched gathers only.
-      if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
-          E->isAltShuffle() ||
-          all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
-          isSplat(E->Scalars) ||
-          (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
-        GatherShuffle =
-            isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
-      }
-      if (GatherShuffle) {
-        if (Value *Delayed = NeedToDelay(E, Entries)) {
-          E->VectorizedValue = Delayed;
-          // Delay emission of gathers which are not ready yet.
-          PostponedGathers.insert(E);
-          // Postpone gather emission, will be emitted after the end of the
-          // process to keep correct order.
-          return Delayed;
-        }
-        assert((Entries.size() == 1 || Entries.size() == 2) &&
-               "Expected shuffle of 1 or 2 entries.");
-        if (!Resized) {
-          unsigned VF1 = Entries.front()->getVectorFactor();
-          unsigned VF2 = Entries.back()->getVectorFactor();
-          if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
-            GatheredScalars.append(VF - GatheredScalars.size(),
-                                   PoisonValue::get(ScalarTy));
-        }
-        // Remove shuffled elements from list of gathers.
-        for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
-          if (Mask[I] != UndefMaskElem)
-            GatheredScalars[I] = PoisonValue::get(ScalarTy);
-        }
-      }
-    }
-    if ((ExtractShuffle || GatherShuffle) &&
-        all_of(GatheredScalars, PoisonValue::classof)) {
-      Value *Vec1 = nullptr;
-      if (ExtractShuffle) {
-        // Gather of extractelements can be represented as just a shuffle of
-        // a single/two vectors the scalars are extracted from.
-        // Find input vectors.
-        Value *Vec2 = nullptr;
-        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
-          if (ExtractMask[I] == UndefMaskElem ||
-              (!Mask.empty() && Mask[I] != UndefMaskElem)) {
-            ExtractMask[I] = UndefMaskElem;
-            continue;
-          }
-          if (isa<UndefValue>(E->Scalars[I]))
-            continue;
-          auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
-          if (!Vec1) {
-            Vec1 = EI->getVectorOperand();
-          } else if (Vec1 != EI->getVectorOperand()) {
-            assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
-                   "Expected only 1 or 2 vectors shuffle.");
-            Vec2 = EI->getVectorOperand();
-          }
-        }
-        if (Vec2)
-          Vec1 = CreateShuffle(Vec1, Vec2, ExtractMask);
-        else if (Vec1)
-          Vec1 = CreateShuffle(Vec1, Vec1, ExtractMask);
-        else
-          Vec1 = PoisonValue::get(
-              FixedVectorType::get(ScalarTy, GatheredScalars.size()));
-      }
-      if (GatherShuffle) {
-        Vec = CreateShuffle(Entries.front()->VectorizedValue,
-                            Entries.back()->VectorizedValue, Mask);
-        if (Vec1) {
-          // Build final mask.
-          for (auto [I, Idx] : enumerate(Mask)) {
-            if (ExtractMask[I] != UndefMaskElem)
-              Idx = I;
-            else if (Idx != UndefMaskElem)
-              Idx = I + VF;
-          }
-          Vec = CreateShuffle(Vec1, Vec, Mask);
-        }
-      } else {
-        Vec = Vec1;
-      }
-      Vec = FinalShuffle(Vec, E);
-    } else {
-      Vec = createBuildVector(E);
-    }
-    E->VectorizedValue = Vec;
-    return Vec;
-  }
-
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize) &&
          "Unhandled state");
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll
@@ -23,10 +23,11 @@
 ;; the vector store that replaces them.
 
 ; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32), metadata ![[ID:[0-9]+]], metadata ptr %arrayidx, metadata !DIExpression())
+; CHECK: store <2 x float> {{.*}} !DIAssignID ![[ID]]
 ; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 32, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 4))
-; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 8))
-; CHECK: store <4 x float> {{.*}} !DIAssignID ![[ID]]
-; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 96, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 12))
+; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 32), metadata ![[ID1:[0-9]+]], metadata ptr %arrayidx7, metadata !DIExpression())
+; CHECK: store <2 x float> {{.*}} !DIAssignID ![[ID1]]
+; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 96, 32), metadata ![[ID1]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 12))
 
 target triple = "x86_64-unknown-unknown"
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -3,27 +3,21 @@
 
 define void @test(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
 ; CHECK-LABEL: @test(
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1:%.*]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
-; CHECK-NEXT:    [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP14]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; CHECK-NEXT:    br label [[TMP17:%.*]]
-; CHECK:       17:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ [[TMP22:%.*]], [[TMP17]] ], [ [[TMP6]], [[TMP3:%.*]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP9]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP13]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP16]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP22]] = or i32 [[TMP18]], 0
-; CHECK-NEXT:    br label [[TMP17]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1:%.*]], <2 x i64> [[TMP0:%.*]], <4 x i32> <i32 0, i32 2, i32 undef, i32 2>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> <i64 0, i64 0, i64 poison, i64 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[TMP11]] ], [ [[TMP10]], [[TMP3:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP16]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    br label [[TMP11]]
 ;
   %4 = extractelement <2 x i64> %1, i64 0
   %5 = or i64 %4, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -343,15 +343,16 @@
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]]
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[Y]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i16> [[TMP1]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr [[Y]], align 2
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP11:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP10]])
-; CHECK-NEXT:    ret i16 [[TMP11]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i16> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP2]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP5]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i16 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i16 [[OP_RDX]]
 ;
 entry:
   %0 = load i16, ptr %x, align 2
@@ -420,29 +421,34 @@
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[P2]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[P2]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
-; CHECK-NEXT:    [[TMP28:%.*]] = mul nuw nsw <16 x i32> [[TMP17]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP28]])
-; CHECK-NEXT:    ret i32 [[TMP29]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw <4 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw nsw <4 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP19]])
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], [[TMP23]]
+; CHECK-NEXT:    ret i32 [[OP_RDX2]]
 ;
 entry:
   %idx.ext = sext i32 %off1 to i64
@@ -708,29 +714,29 @@
 ; CHECK-NEXT:    [[ARRAYIDX72:%.*]] = getelementptr inbounds i32, ptr [[Z:%.*]], i64 1
 ; CHECK-NEXT:    [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]]
 ; CHECK-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 6
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[X]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i32>, ptr [[Y]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = mul nsw <2 x i32> [[TMP11]], [[TMP7]]
-; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP13]], [[TMP9]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 ; CHECK-NEXT:    [[ARRAYIDX84:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 7
 ; CHECK-NEXT:    [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
 ; CHECK-NEXT:    [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 11
-; CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
 ; CHECK-NEXT:    store i32 [[MUL73]], ptr [[Z]], align 4
-; CHECK-NEXT:    store <4 x i32> [[SHUFFLE]], ptr [[ARRAYIDX72]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4
 ; CHECK-NEXT:    store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4
 ; CHECK-NEXT:    store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = mul nsw <2 x i32> [[TMP22]], [[TMP18]]
-; CHECK-NEXT:    [[TMP26:%.*]] = mul nsw <2 x i32> [[TMP24]], [[TMP20]]
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> [[TMP26]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    store <4 x i32> [[SHUFFLE1]], ptr [[ARRAYIDX84]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP18]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -833,14 +839,17 @@
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]]
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr [[Y]], align 2
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEXT:    store <8 x i16> [[SHUFFLE]], ptr [[DST0:%.*]], align 2
+; CHECK-NEXT:    [[DST4:%.*]] = getelementptr inbounds i16, ptr [[DST0:%.*]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[Y]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i16> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i16> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    store <4 x i16> [[TMP3]], ptr [[DST0]], align 2
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    store <4 x i16> [[TMP7]], ptr [[DST4]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -921,30 +930,30 @@
 ; CHECK-NEXT:    [[DST4:%.*]] = getelementptr inbounds i32, ptr [[DST0:%.*]], i64 4
 ; CHECK-NEXT:    [[DST8:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 8
 ; CHECK-NEXT:    [[DST12:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 12
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[P2]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[P2]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw <4 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP10]], [[TMP13]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32>
-; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw nsw <4 x i32> [[TMP18]], [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
-; CHECK-NEXT:    [[TMP26:%.*]] = zext <4 x i8> [[TMP25]] to <4 x i32>
-; CHECK-NEXT:    [[TMP28:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP29:%.*]] = zext <4 x i8> [[TMP28]] to <4 x i32>
-; CHECK-NEXT:    [[TMP30:%.*]] = mul nuw nsw <4 x i32> [[TMP26]], [[TMP29]]
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[DST0]], align 4
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[DST4]], align 4
-; CHECK-NEXT:    store <4 x i32> [[TMP22]], ptr [[DST8]], align 4
-; CHECK-NEXT:    store <4 x i32> [[TMP30]], ptr [[DST12]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw nsw <4 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[DST0]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP9]], ptr [[DST4]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[DST8]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP19]], ptr [[DST12]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1198,88 +1207,421 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[ST1:%.*]] to i64
 ; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[P1:%.*]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[P2:%.*]], align 1
+; CHECK-NEXT:    [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV2]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[CONV4:%.*]] = zext i8 [[TMP2]] to i32
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[CONV6:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[CONV4]], [[CONV6]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl nsw i32 [[SUB7]], 16
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[SHL]], [[SUB]]
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1
+; CHECK-NEXT:    [[CONV9:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX10]], align 1
+; CHECK-NEXT:    [[CONV11:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT:    [[SUB12:%.*]] = sub nsw i32 [[CONV9]], [[CONV11]]
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 5
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX13]], align 1
+; CHECK-NEXT:    [[CONV14:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX15]], align 1
+; CHECK-NEXT:    [[CONV16:%.*]] = zext i8 [[TMP7]] to i32
+; CHECK-NEXT:    [[SUB17:%.*]] = sub nsw i32 [[CONV14]], [[CONV16]]
+; CHECK-NEXT:    [[SHL18:%.*]] = shl nsw i32 [[SUB17]], 16
+; CHECK-NEXT:    [[ADD19:%.*]] = add nsw i32 [[SHL18]], [[SUB12]]
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX20]], align 1
+; CHECK-NEXT:    [[CONV21:%.*]] = zext i8 [[TMP8]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 2
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX22]], align 1
+; CHECK-NEXT:    [[CONV23:%.*]] = zext i8 [[TMP9]] to i32
+; CHECK-NEXT:    [[SUB24:%.*]] = sub nsw i32 [[CONV21]], [[CONV23]]
+; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 6
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX25]], align 1
+; CHECK-NEXT:    [[CONV26:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 6
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX27]], align 1
+; CHECK-NEXT:    [[CONV28:%.*]] = zext i8 [[TMP11]] to i32
+; CHECK-NEXT:    [[SUB29:%.*]] = sub nsw i32 [[CONV26]], [[CONV28]]
+; CHECK-NEXT:    [[SHL30:%.*]] = shl nsw i32 [[SUB29]], 16
+; CHECK-NEXT:    [[ADD31:%.*]] = add nsw i32 [[SHL30]], [[SUB24]]
+; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
+; CHECK-NEXT:    [[CONV33:%.*]] = zext i8 [[TMP12]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 3
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX34]], align 1
+; CHECK-NEXT:    [[CONV35:%.*]] = zext i8 [[TMP13]] to i32
+; CHECK-NEXT:    [[SUB36:%.*]] = sub nsw i32 [[CONV33]], [[CONV35]]
+; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 7
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX37]], align 1
+; CHECK-NEXT:    [[CONV38:%.*]] = zext i8 [[TMP14]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 7
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX39]], align 1
+; CHECK-NEXT:    [[CONV40:%.*]] = zext i8 [[TMP15]] to i32
+; CHECK-NEXT:    [[SUB41:%.*]] = sub nsw i32 [[CONV38]], [[CONV40]]
+; CHECK-NEXT:    [[SHL42:%.*]] = shl nsw i32 [[SUB41]], 16
+; CHECK-NEXT:    [[ADD43:%.*]] = add nsw i32 [[SHL42]], [[SUB36]]
+; CHECK-NEXT:    [[ADD44:%.*]] = add nsw i32 [[ADD19]], [[ADD]]
+; CHECK-NEXT:    [[SUB45:%.*]] = sub nsw i32 [[ADD]], [[ADD19]]
+; CHECK-NEXT:    [[ADD46:%.*]] = add nsw i32 [[ADD43]], [[ADD31]]
+; CHECK-NEXT:    [[SUB47:%.*]] = sub nsw i32 [[ADD31]], [[ADD43]]
+; CHECK-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD46]], [[ADD44]]
+; CHECK-NEXT:    [[SUB51:%.*]] = sub nsw i32 [[ADD44]], [[ADD46]]
+; CHECK-NEXT:    [[ADD55:%.*]] = add nsw i32 [[SUB47]], [[SUB45]]
+; CHECK-NEXT:    [[SUB59:%.*]] = sub nsw i32 [[SUB45]], [[SUB47]]
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[CONV_1:%.*]] = zext i8 [[TMP16]] to i32
+; CHECK-NEXT:    [[TMP17:%.*]] = load i8, ptr [[ADD_PTR64]], align 1
+; CHECK-NEXT:    [[CONV2_1:%.*]] = zext i8 [[TMP17]] to i32
+; CHECK-NEXT:    [[SUB_1:%.*]] = sub nsw i32 [[CONV_1]], [[CONV2_1]]
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
+; CHECK-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[CONV4_1:%.*]] = zext i8 [[TMP18]] to i32
 ; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
+; CHECK-NEXT:    [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX5_1]], align 1
+; CHECK-NEXT:    [[CONV6_1:%.*]] = zext i8 [[TMP19]] to i32
+; CHECK-NEXT:    [[SUB7_1:%.*]] = sub nsw i32 [[CONV4_1]], [[CONV6_1]]
+; CHECK-NEXT:    [[SHL_1:%.*]] = shl nsw i32 [[SUB7_1]], 16
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[SHL_1]], [[SUB_1]]
+; CHECK-NEXT:    [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 1
+; CHECK-NEXT:    [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT:    [[CONV9_1:%.*]] = zext i8 [[TMP20]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 1
+; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX10_1]], align 1
+; CHECK-NEXT:    [[CONV11_1:%.*]] = zext i8 [[TMP21]] to i32
+; CHECK-NEXT:    [[SUB12_1:%.*]] = sub nsw i32 [[CONV9_1]], [[CONV11_1]]
+; CHECK-NEXT:    [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 5
+; CHECK-NEXT:    [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX13_1]], align 1
+; CHECK-NEXT:    [[CONV14_1:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 5
+; CHECK-NEXT:    [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX15_1]], align 1
+; CHECK-NEXT:    [[CONV16_1:%.*]] = zext i8 [[TMP23]] to i32
+; CHECK-NEXT:    [[SUB17_1:%.*]] = sub nsw i32 [[CONV14_1]], [[CONV16_1]]
+; CHECK-NEXT:    [[SHL18_1:%.*]] = shl nsw i32 [[SUB17_1]], 16
+; CHECK-NEXT:    [[ADD19_1:%.*]] = add nsw i32 [[SHL18_1]], [[SUB12_1]]
+; CHECK-NEXT:    [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 2
+; CHECK-NEXT:    [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX20_1]], align 1
+; CHECK-NEXT:    [[CONV21_1:%.*]] = zext i8 [[TMP24]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 2
+; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1
+; CHECK-NEXT:    [[CONV23_1:%.*]] = zext i8 [[TMP25]] to i32
+; CHECK-NEXT:    [[SUB24_1:%.*]] = sub nsw i32 [[CONV21_1]], [[CONV23_1]]
+; CHECK-NEXT:    [[ARRAYIDX25_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 6
+; CHECK-NEXT:    [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX25_1]], align 1
+; CHECK-NEXT:    [[CONV26_1:%.*]] = zext i8 [[TMP26]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 6
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1
+; CHECK-NEXT:    [[CONV28_1:%.*]] = zext i8 [[TMP27]] to i32
+; CHECK-NEXT:    [[SUB29_1:%.*]] = sub nsw i32 [[CONV26_1]], [[CONV28_1]]
+; CHECK-NEXT:    [[SHL30_1:%.*]] = shl nsw i32 [[SUB29_1]], 16
+; CHECK-NEXT:    [[ADD31_1:%.*]] = add nsw i32 [[SHL30_1]], [[SUB24_1]]
+; CHECK-NEXT:    [[ARRAYIDX32_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 3
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
+; CHECK-NEXT:    [[CONV33_1:%.*]] = zext i8 [[TMP28]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 3
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX34_1]], align 1
+; CHECK-NEXT:    [[CONV35_1:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT:    [[SUB36_1:%.*]] = sub nsw i32 [[CONV33_1]], [[CONV35_1]]
+; CHECK-NEXT:    [[ARRAYIDX37_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 7
+; CHECK-NEXT:    [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX37_1]], align 1
+; CHECK-NEXT:    [[CONV38_1:%.*]] = zext i8 [[TMP30]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 7
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX39_1]], align 1
+; CHECK-NEXT:    [[CONV40_1:%.*]] = zext i8 [[TMP31]] to i32
+; CHECK-NEXT:    [[SUB41_1:%.*]] = sub nsw i32 [[CONV38_1]], [[CONV40_1]]
+; CHECK-NEXT:    [[SHL42_1:%.*]] = shl nsw i32 [[SUB41_1]], 16
+; CHECK-NEXT:    [[ADD43_1:%.*]] = add nsw i32 [[SHL42_1]], [[SUB36_1]]
+; CHECK-NEXT:    [[ADD44_1:%.*]] = add nsw i32 [[ADD19_1]], [[ADD_1]]
+; CHECK-NEXT:    [[SUB45_1:%.*]] = sub nsw i32 [[ADD_1]], [[ADD19_1]]
+; CHECK-NEXT:    [[ADD46_1:%.*]] = add nsw i32 [[ADD43_1]], [[ADD31_1]]
+; CHECK-NEXT:    [[SUB47_1:%.*]] = sub nsw i32 [[ADD31_1]], [[ADD43_1]]
+; CHECK-NEXT:    [[ADD48_1:%.*]] = add nsw i32 [[ADD46_1]], [[ADD44_1]]
+; CHECK-NEXT:    [[SUB51_1:%.*]] = sub nsw i32 [[ADD44_1]], [[ADD46_1]]
+; CHECK-NEXT:    [[ADD55_1:%.*]] = add nsw i32 [[SUB47_1]], [[SUB45_1]]
+; CHECK-NEXT:    [[SUB59_1:%.*]] = sub nsw i32 [[SUB45_1]], [[SUB47_1]]
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT:    [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT:    [[SUB_2:%.*]] = sub nsw i32 [[CONV_2]], [[CONV2_2]]
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT:    [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[TMP35:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1
+; CHECK-NEXT:    [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32
+; CHECK-NEXT:    [[SUB7_2:%.*]] = sub nsw i32 [[CONV4_2]], [[CONV6_2]]
+; CHECK-NEXT:    [[SHL_2:%.*]] = shl nsw i32 [[SUB7_2]], 16
+; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[SHL_2]], [[SUB_2]]
+; CHECK-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 1
+; CHECK-NEXT:    [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT:    [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1
+; CHECK-NEXT:    [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32
+; CHECK-NEXT:    [[SUB12_2:%.*]] = sub nsw i32 [[CONV9_2]], [[CONV11_2]]
+; CHECK-NEXT:    [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 5
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1
+; CHECK-NEXT:    [[CONV14_2:%.*]] = zext i8 [[TMP38]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 5
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX15_2]], align 1
+; CHECK-NEXT:    [[CONV16_2:%.*]] = zext i8 [[TMP39]] to i32
+; CHECK-NEXT:    [[SUB17_2:%.*]] = sub nsw i32 [[CONV14_2]], [[CONV16_2]]
+; CHECK-NEXT:    [[SHL18_2:%.*]] = shl nsw i32 [[SUB17_2]], 16
+; CHECK-NEXT:    [[ADD19_2:%.*]] = add nsw i32 [[SHL18_2]], [[SUB12_2]]
+; CHECK-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 2
+; CHECK-NEXT:    [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1
+; CHECK-NEXT:    [[CONV21_2:%.*]] = zext i8 [[TMP40]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 2
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1
+; CHECK-NEXT:    [[CONV23_2:%.*]] = zext i8 [[TMP41]] to i32
+; CHECK-NEXT:    [[SUB24_2:%.*]] = sub nsw i32 [[CONV21_2]], [[CONV23_2]]
+; CHECK-NEXT:    [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 6
+; CHECK-NEXT:    [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1
+; CHECK-NEXT:    [[CONV26_2:%.*]] = zext i8 [[TMP42]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 6
+; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1
+; CHECK-NEXT:    [[CONV28_2:%.*]] = zext i8 [[TMP43]] to i32
+; CHECK-NEXT:    [[SUB29_2:%.*]] = sub nsw i32 [[CONV26_2]], [[CONV28_2]]
+; CHECK-NEXT:    [[SHL30_2:%.*]] = shl nsw i32 [[SUB29_2]], 16
+; CHECK-NEXT:    [[ADD31_2:%.*]] = add nsw i32 [[SHL30_2]], [[SUB24_2]]
+; CHECK-NEXT:    [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 3
+; CHECK-NEXT:    [[TMP44:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1
+; CHECK-NEXT:    [[CONV33_2:%.*]] = zext i8 [[TMP44]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 3
+; CHECK-NEXT:    [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1
+; CHECK-NEXT:    [[CONV35_2:%.*]] = zext i8 [[TMP45]] to i32
+; CHECK-NEXT:    [[SUB36_2:%.*]] = sub nsw i32 [[CONV33_2]], [[CONV35_2]]
+; CHECK-NEXT:    [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 7
+; CHECK-NEXT:    [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1
+; CHECK-NEXT:    [[CONV38_2:%.*]] = zext i8 [[TMP46]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 7
+; CHECK-NEXT:    [[TMP47:%.*]] = load i8, ptr [[ARRAYIDX39_2]], align 1
+; CHECK-NEXT:    [[CONV40_2:%.*]] = zext i8 [[TMP47]] to i32
+; CHECK-NEXT:    [[SUB41_2:%.*]] = sub nsw i32 [[CONV38_2]], [[CONV40_2]]
+; CHECK-NEXT:    [[SHL42_2:%.*]] = shl nsw i32 [[SUB41_2]], 16
+; CHECK-NEXT:    [[ADD43_2:%.*]] = add nsw i32 [[SHL42_2]], [[SUB36_2]]
+; CHECK-NEXT:    [[ADD44_2:%.*]] = add nsw i32 [[ADD19_2]], [[ADD_2]]
+; CHECK-NEXT:    [[SUB45_2:%.*]] = sub nsw i32 [[ADD_2]], [[ADD19_2]]
+; CHECK-NEXT:    [[ADD46_2:%.*]] = add nsw i32 [[ADD43_2]], [[ADD31_2]]
+; CHECK-NEXT:    [[SUB47_2:%.*]] = sub nsw i32 [[ADD31_2]], [[ADD43_2]]
+; CHECK-NEXT:    [[ADD48_2:%.*]] = add nsw i32 [[ADD46_2]], [[ADD44_2]]
+; CHECK-NEXT:    [[SUB51_2:%.*]] = sub nsw i32 [[ADD44_2]], [[ADD46_2]]
+; CHECK-NEXT:    [[ADD55_2:%.*]] = add nsw i32 [[SUB47_2]], [[SUB45_2]]
+; CHECK-NEXT:    [[SUB59_2:%.*]] = sub nsw i32 [[SUB45_2]], [[SUB47_2]]
 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i8, ptr [[ADD_PTR_2]], align 1
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32
+; CHECK-NEXT:    [[TMP49:%.*]] = load i8, ptr [[ADD_PTR64_2]], align 1
+; CHECK-NEXT:    [[CONV2_3:%.*]] = zext i8 [[TMP49]] to i32
+; CHECK-NEXT:    [[SUB_3:%.*]] = sub nsw i32 [[CONV_3]], [[CONV2_3]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4
+; CHECK-NEXT:    [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1
+; CHECK-NEXT:    [[CONV4_3:%.*]] = zext i8 [[TMP50]] to i32
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[P2]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
-; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT:    [[TMP23:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP25]], <4 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i32>
-; CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP33]], <4 x i8> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32>
-; CHECK-NEXT:    [[TMP40:%.*]] = sub nsw <16 x i32> [[TMP31]], [[TMP39]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP42]], <4 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32>
-; CHECK-NEXT:    [[TMP50:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP50]], <4 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i8> [[TMP51]], <16 x i8> [[TMP52]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i8> [[TMP53]], <16 x i8> [[TMP54]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i32>
-; CHECK-NEXT:    [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP48]], [[TMP56]]
-; CHECK-NEXT:    [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:    [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP40]]
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEXT:    [[TMP61:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]]
-; CHECK-NEXT:    [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]]
-; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 22, i32 18, i32 26, i32 30, i32 5, i32 1, i32 9, i32 13, i32 20, i32 16, i32 24, i32 28>
-; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> <i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]]
-; CHECK-NEXT:    [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]]
-; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP67]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEXT:    [[TMP69:%.*]] = add nsw <16 x i32> [[TMP67]], [[TMP68]]
-; CHECK-NEXT:    [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP67]], [[TMP68]]
-; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 24, i32 9, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>
-; CHECK-NEXT:    [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP71]], <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEXT:    [[TMP73:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP72]]
-; CHECK-NEXT:    [[TMP74:%.*]] = sub nsw <16 x i32> [[TMP71]], [[TMP72]]
-; CHECK-NEXT:    [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP73]], <16 x i32> [[TMP74]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP76:%.*]] = lshr <16 x i32> [[TMP75]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT:    [[TMP77:%.*]] = and <16 x i32> [[TMP76]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP78:%.*]] = mul nuw <16 x i32> [[TMP77]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP79:%.*]] = add <16 x i32> [[TMP78]], [[TMP75]]
-; CHECK-NEXT:    [[TMP80:%.*]] = xor <16 x i32> [[TMP79]], [[TMP78]]
-; CHECK-NEXT:    [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP80]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP81]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP81]], 16
+; CHECK-NEXT:    [[TMP51:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[CONV6_3:%.*]] = zext i8 [[TMP51]] to i32
+; CHECK-NEXT:    [[SUB7_3:%.*]] = sub nsw i32 [[CONV4_3]], [[CONV6_3]]
+; CHECK-NEXT:    [[SHL_3:%.*]] = shl nsw i32 [[SUB7_3]], 16
+; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[SHL_3]], [[SUB_3]]
+; CHECK-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 1
+; CHECK-NEXT:    [[TMP52:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1
+; CHECK-NEXT:    [[CONV9_3:%.*]] = zext i8 [[TMP52]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1
+; CHECK-NEXT:    [[CONV11_3:%.*]] = zext i8 [[TMP53]] to i32
+; CHECK-NEXT:    [[SUB12_3:%.*]] = sub nsw i32 [[CONV9_3]], [[CONV11_3]]
+; CHECK-NEXT:    [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 5
+; CHECK-NEXT:    [[TMP54:%.*]] = load i8, ptr [[ARRAYIDX13_3]], align 1
+; CHECK-NEXT:    [[CONV14_3:%.*]] = zext i8 [[TMP54]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 5
+; CHECK-NEXT:    [[TMP55:%.*]] = load i8, ptr [[ARRAYIDX15_3]], align 1
+; CHECK-NEXT:    [[CONV16_3:%.*]] = zext i8 [[TMP55]] to i32
+; CHECK-NEXT:    [[SUB17_3:%.*]] = sub nsw i32 [[CONV14_3]], [[CONV16_3]]
+; CHECK-NEXT:    [[SHL18_3:%.*]] = shl nsw i32 [[SUB17_3]], 16
+; CHECK-NEXT:    [[ADD19_3:%.*]] = add nsw i32 [[SHL18_3]], [[SUB12_3]]
+; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 2
+; CHECK-NEXT:    [[TMP56:%.*]] = load i8, ptr [[ARRAYIDX20_3]], align 1
+; CHECK-NEXT:    [[CONV21_3:%.*]] = zext i8 [[TMP56]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 2
+; CHECK-NEXT:    [[TMP57:%.*]] = load i8, ptr [[ARRAYIDX22_3]], align 1
+; CHECK-NEXT:    [[CONV23_3:%.*]] = zext i8 [[TMP57]] to i32
+; CHECK-NEXT:    [[SUB24_3:%.*]] = sub nsw i32 [[CONV21_3]], [[CONV23_3]]
+; CHECK-NEXT:    [[ARRAYIDX25_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 6
+; CHECK-NEXT:    [[TMP58:%.*]] = load i8, ptr [[ARRAYIDX25_3]], align 1
+; CHECK-NEXT:    [[CONV26_3:%.*]] = zext i8 [[TMP58]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 6
+; CHECK-NEXT:    [[TMP59:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1
+; CHECK-NEXT:    [[CONV28_3:%.*]] = zext i8 [[TMP59]] to i32
+; CHECK-NEXT:    [[SUB29_3:%.*]] = sub nsw i32 [[CONV26_3]], [[CONV28_3]]
+; CHECK-NEXT:    [[SHL30_3:%.*]] = shl nsw i32 [[SUB29_3]], 16
+; CHECK-NEXT:    [[ADD31_3:%.*]] = add nsw i32 [[SHL30_3]], [[SUB24_3]]
+; CHECK-NEXT:    [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 3
+; CHECK-NEXT:    [[TMP60:%.*]] = load i8, ptr [[ARRAYIDX32_3]], align 1
+; CHECK-NEXT:    [[CONV33_3:%.*]] = zext i8 [[TMP60]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 3
+; CHECK-NEXT:    [[TMP61:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1
+; CHECK-NEXT:    [[CONV35_3:%.*]] = zext i8 [[TMP61]] to i32
+; CHECK-NEXT:    [[SUB36_3:%.*]] = sub nsw i32 [[CONV33_3]], [[CONV35_3]]
+; CHECK-NEXT:    [[ARRAYIDX37_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 7
+; CHECK-NEXT:    [[TMP62:%.*]] = load i8, ptr [[ARRAYIDX37_3]], align 1
+; CHECK-NEXT:    [[CONV38_3:%.*]] = zext i8 [[TMP62]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 7
+; CHECK-NEXT:    [[TMP63:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1
+; CHECK-NEXT:    [[CONV40_3:%.*]] = zext i8 [[TMP63]] to i32
+; CHECK-NEXT:    [[SUB41_3:%.*]] = sub nsw i32 [[CONV38_3]], [[CONV40_3]]
+; CHECK-NEXT:    [[SHL42_3:%.*]] = shl nsw i32 [[SUB41_3]], 16
+; CHECK-NEXT:    [[ADD43_3:%.*]] = add nsw i32 [[SHL42_3]], [[SUB36_3]]
+; CHECK-NEXT:    [[ADD44_3:%.*]] = add nsw i32 [[ADD19_3]], [[ADD_3]]
+; CHECK-NEXT:    [[SUB45_3:%.*]] = sub nsw i32 [[ADD_3]], [[ADD19_3]]
+; CHECK-NEXT:    [[ADD46_3:%.*]] = add nsw i32 [[ADD43_3]], [[ADD31_3]]
+; CHECK-NEXT:    [[SUB47_3:%.*]] = sub nsw i32 [[ADD31_3]], [[ADD43_3]]
+; CHECK-NEXT:    [[ADD48_3:%.*]] = add nsw i32 [[ADD46_3]], [[ADD44_3]]
+; CHECK-NEXT:    [[SUB51_3:%.*]] = sub nsw i32 [[ADD44_3]], [[ADD46_3]]
+; CHECK-NEXT:    [[ADD55_3:%.*]] = add nsw i32 [[SUB47_3]], [[SUB45_3]]
+; CHECK-NEXT:    [[SUB59_3:%.*]] = sub nsw i32 [[SUB45_3]], [[SUB47_3]]
+; CHECK-NEXT:    [[ADD78:%.*]] = add nsw i32 [[ADD48_1]], [[ADD48]]
+; CHECK-NEXT:    [[SUB86:%.*]] = sub nsw i32 [[ADD48]], [[ADD48_1]]
+; CHECK-NEXT:    [[ADD94:%.*]] = add nsw i32 [[ADD48_3]], [[ADD48_2]]
+; CHECK-NEXT:    [[SUB102:%.*]] = sub nsw i32 [[ADD48_2]], [[ADD48_3]]
+; CHECK-NEXT:    [[ADD103:%.*]] = add nsw i32 [[ADD94]], [[ADD78]]
+; CHECK-NEXT:    [[SUB104:%.*]] = sub nsw i32 [[ADD78]], [[ADD94]]
+; CHECK-NEXT:    [[ADD105:%.*]] = add nsw i32 [[SUB102]], [[SUB86]]
+; CHECK-NEXT:    [[SUB106:%.*]] = sub nsw i32 [[SUB86]], [[SUB102]]
+; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[ADD103]], 15
+; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535
+; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
+; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]]
+; CHECK-NEXT:    [[SHR_I184:%.*]] = lshr i32 [[ADD105]], 15
+; CHECK-NEXT:    [[AND_I185:%.*]] = and i32 [[SHR_I184]], 65537
+; CHECK-NEXT:    [[MUL_I186:%.*]] = mul nuw i32 [[AND_I185]], 65535
+; CHECK-NEXT:    [[ADD_I187:%.*]] = add i32 [[MUL_I186]], [[ADD105]]
+; CHECK-NEXT:    [[XOR_I188:%.*]] = xor i32 [[ADD_I187]], [[MUL_I186]]
+; CHECK-NEXT:    [[SHR_I189:%.*]] = lshr i32 [[SUB104]], 15
+; CHECK-NEXT:    [[AND_I190:%.*]] = and i32 [[SHR_I189]], 65537
+; CHECK-NEXT:    [[MUL_I191:%.*]] = mul nuw i32 [[AND_I190]], 65535
+; CHECK-NEXT:    [[ADD_I192:%.*]] = add i32 [[MUL_I191]], [[SUB104]]
+; CHECK-NEXT:    [[XOR_I193:%.*]] = xor i32 [[ADD_I192]], [[MUL_I191]]
+; CHECK-NEXT:    [[SHR_I194:%.*]] = lshr i32 [[SUB106]], 15
+; CHECK-NEXT:    [[AND_I195:%.*]] = and i32 [[SHR_I194]], 65537
+; CHECK-NEXT:    [[MUL_I196:%.*]] = mul nuw i32 [[AND_I195]], 65535
+; CHECK-NEXT:    [[ADD_I197:%.*]] = add i32 [[MUL_I196]], [[SUB106]]
+; CHECK-NEXT:    [[XOR_I198:%.*]] = xor i32 [[ADD_I197]], [[MUL_I196]]
+; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I188]], [[XOR_I]]
+; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I193]]
+; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I198]]
+; CHECK-NEXT:    [[ADD78_1:%.*]] = add nsw i32 [[ADD55_1]], [[ADD55]]
+; CHECK-NEXT:    [[SUB86_1:%.*]] = sub nsw i32 [[ADD55]], [[ADD55_1]]
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add nsw i32 [[ADD55_3]], [[ADD55_2]]
+; CHECK-NEXT:    [[SUB102_1:%.*]] = sub nsw i32 [[ADD55_2]], [[ADD55_3]]
+; CHECK-NEXT:    [[ADD103_1:%.*]] = add nsw i32 [[ADD94_1]], [[ADD78_1]]
+; CHECK-NEXT:    [[SUB104_1:%.*]] = sub nsw i32 [[ADD78_1]], [[ADD94_1]]
+; CHECK-NEXT:    [[ADD105_1:%.*]] = add nsw i32 [[SUB102_1]], [[SUB86_1]]
+; CHECK-NEXT:    [[SUB106_1:%.*]] = sub nsw i32 [[SUB86_1]], [[SUB102_1]]
+; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[ADD103_1]], 15
+; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
+; CHECK-NEXT:    [[MUL_I_1:%.*]] = mul nuw i32 [[AND_I_1]], 65535
+; CHECK-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
+; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[MUL_I_1]]
+; CHECK-NEXT:    [[SHR_I184_1:%.*]] = lshr i32 [[ADD105_1]], 15
+; CHECK-NEXT:    [[AND_I185_1:%.*]] = and i32 [[SHR_I184_1]], 65537
+; CHECK-NEXT:    [[MUL_I186_1:%.*]] = mul nuw i32 [[AND_I185_1]], 65535
+; CHECK-NEXT:    [[ADD_I187_1:%.*]] = add i32 [[MUL_I186_1]], [[ADD105_1]]
+; CHECK-NEXT:    [[XOR_I188_1:%.*]] = xor i32 [[ADD_I187_1]], [[MUL_I186_1]]
+; CHECK-NEXT:    [[SHR_I189_1:%.*]] = lshr i32 [[SUB104_1]], 15
+; CHECK-NEXT:    [[AND_I190_1:%.*]] = and i32 [[SHR_I189_1]], 65537
+; CHECK-NEXT:    [[MUL_I191_1:%.*]] = mul nuw i32 [[AND_I190_1]], 65535
+; CHECK-NEXT:    [[ADD_I192_1:%.*]] = add i32 [[MUL_I191_1]], [[SUB104_1]]
+; CHECK-NEXT:    [[XOR_I193_1:%.*]] = xor i32 [[ADD_I192_1]], [[MUL_I191_1]]
+; CHECK-NEXT:    [[SHR_I194_1:%.*]] = lshr i32 [[SUB106_1]], 15
+; CHECK-NEXT:    [[AND_I195_1:%.*]] = and i32 [[SHR_I194_1]], 65537
+; CHECK-NEXT:    [[MUL_I196_1:%.*]] = mul nuw i32 [[AND_I195_1]], 65535
+; CHECK-NEXT:    [[ADD_I197_1:%.*]] = add i32 [[MUL_I196_1]], [[SUB106_1]]
+; CHECK-NEXT:    [[XOR_I198_1:%.*]] = xor i32 [[ADD_I197_1]], [[MUL_I196_1]]
+; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I188_1]], [[ADD113]]
+; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
+; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I193_1]]
+; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I198_1]]
+; CHECK-NEXT:    [[ADD78_2:%.*]] = add nsw i32 [[SUB51_1]], [[SUB51]]
+; CHECK-NEXT:    [[SUB86_2:%.*]] = sub nsw i32 [[SUB51]], [[SUB51_1]]
+; CHECK-NEXT:    [[ADD94_2:%.*]] = add nsw i32 [[SUB51_3]], [[SUB51_2]]
+; CHECK-NEXT:    [[SUB102_2:%.*]] = sub nsw i32 [[SUB51_2]], [[SUB51_3]]
+; CHECK-NEXT:    [[ADD103_2:%.*]] = add nsw i32 [[ADD94_2]], [[ADD78_2]]
+; CHECK-NEXT:    [[SUB104_2:%.*]] = sub nsw i32 [[ADD78_2]], [[ADD94_2]]
+; CHECK-NEXT:    [[ADD105_2:%.*]] = add nsw i32 [[SUB102_2]], [[SUB86_2]]
+; CHECK-NEXT:    [[SUB106_2:%.*]] = sub nsw i32 [[SUB86_2]], [[SUB102_2]]
+; CHECK-NEXT:    [[SHR_I_2:%.*]] = lshr i32 [[ADD103_2]], 15
+; CHECK-NEXT:    [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
+; CHECK-NEXT:    [[MUL_I_2:%.*]] = mul nuw i32 [[AND_I_2]], 65535
+; CHECK-NEXT:    [[ADD_I_2:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]]
+; CHECK-NEXT:    [[XOR_I_2:%.*]] = xor i32 [[ADD_I_2]], [[MUL_I_2]]
+; CHECK-NEXT:    [[SHR_I184_2:%.*]] = lshr i32 [[ADD105_2]], 15
+; CHECK-NEXT:    [[AND_I185_2:%.*]] = and i32 [[SHR_I184_2]], 65537
+; CHECK-NEXT:    [[MUL_I186_2:%.*]] = mul nuw i32 [[AND_I185_2]], 65535
+; CHECK-NEXT:    [[ADD_I187_2:%.*]] = add i32 [[MUL_I186_2]], [[ADD105_2]]
+; CHECK-NEXT:    [[XOR_I188_2:%.*]] = xor i32 [[ADD_I187_2]], [[MUL_I186_2]]
+; CHECK-NEXT:    [[SHR_I189_2:%.*]] = lshr i32 [[SUB104_2]], 15
+; CHECK-NEXT:    [[AND_I190_2:%.*]] = and i32 [[SHR_I189_2]], 65537
+; CHECK-NEXT:    [[MUL_I191_2:%.*]] = mul nuw i32 [[AND_I190_2]], 65535
+; CHECK-NEXT:    [[ADD_I192_2:%.*]] = add i32 [[MUL_I191_2]], [[SUB104_2]]
+; CHECK-NEXT:    [[XOR_I193_2:%.*]] = xor i32 [[ADD_I192_2]], [[MUL_I191_2]]
+; CHECK-NEXT:    [[SHR_I194_2:%.*]] = lshr i32 [[SUB106_2]], 15
+; CHECK-NEXT:    [[AND_I195_2:%.*]] = and i32 [[SHR_I194_2]], 65537
+; CHECK-NEXT:    [[MUL_I196_2:%.*]] = mul nuw i32 [[AND_I195_2]], 65535
+; CHECK-NEXT:    [[ADD_I197_2:%.*]] = add i32 [[MUL_I196_2]], [[SUB106_2]]
+; CHECK-NEXT:    [[XOR_I198_2:%.*]] = xor i32 [[ADD_I197_2]], [[MUL_I196_2]]
+; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I188_2]], [[ADD113_1]]
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[XOR_I_2]]
+; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[XOR_I193_2]]
+; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I198_2]]
+; CHECK-NEXT:    [[ADD78_3:%.*]] = add nsw i32 [[SUB59_1]], [[SUB59]]
+; CHECK-NEXT:    [[SUB86_3:%.*]] = sub nsw i32 [[SUB59]], [[SUB59_1]]
+; CHECK-NEXT:    [[ADD94_3:%.*]] = add nsw i32 [[SUB59_3]], [[SUB59_2]]
+; CHECK-NEXT:    [[SUB102_3:%.*]] = sub nsw i32 [[SUB59_2]], [[SUB59_3]]
+; CHECK-NEXT:    [[ADD103_3:%.*]] = add nsw i32 [[ADD94_3]], [[ADD78_3]]
+; CHECK-NEXT:    [[SUB104_3:%.*]] = sub nsw i32 [[ADD78_3]], [[ADD94_3]]
+; CHECK-NEXT:    [[ADD105_3:%.*]] = add nsw i32 [[SUB102_3]], [[SUB86_3]]
+; CHECK-NEXT:    [[SUB106_3:%.*]] = sub nsw i32 [[SUB86_3]], [[SUB102_3]]
+; CHECK-NEXT:    [[SHR_I_3:%.*]] = lshr i32 [[ADD103_3]], 15
+; CHECK-NEXT:    [[AND_I_3:%.*]] = and i32 [[SHR_I_3]], 65537
+; CHECK-NEXT:    [[MUL_I_3:%.*]] = mul nuw i32 [[AND_I_3]], 65535
+; CHECK-NEXT:    [[ADD_I_3:%.*]] = add i32 [[MUL_I_3]], [[ADD103_3]]
+; CHECK-NEXT:    [[XOR_I_3:%.*]] = xor i32 [[ADD_I_3]], [[MUL_I_3]]
+; CHECK-NEXT:    [[SHR_I184_3:%.*]] = lshr i32 [[ADD105_3]], 15
+; CHECK-NEXT:    [[AND_I185_3:%.*]] = and i32 [[SHR_I184_3]], 65537
+; CHECK-NEXT:    [[MUL_I186_3:%.*]] = mul nuw i32 [[AND_I185_3]], 65535
+; CHECK-NEXT:    [[ADD_I187_3:%.*]] = add i32 [[MUL_I186_3]], [[ADD105_3]]
+; CHECK-NEXT:    [[XOR_I188_3:%.*]] = xor i32 [[ADD_I187_3]], [[MUL_I186_3]]
+; CHECK-NEXT:    [[SHR_I189_3:%.*]] = lshr i32 [[SUB104_3]], 15
+; CHECK-NEXT:    [[AND_I190_3:%.*]] = and i32 [[SHR_I189_3]], 65537
+; CHECK-NEXT:    [[MUL_I191_3:%.*]] = mul nuw i32 [[AND_I190_3]], 65535
+; CHECK-NEXT:    [[ADD_I192_3:%.*]] = add i32 [[MUL_I191_3]], [[SUB104_3]]
+; CHECK-NEXT:    [[XOR_I193_3:%.*]] = xor i32 [[ADD_I192_3]], [[MUL_I191_3]]
+; CHECK-NEXT:    [[SHR_I194_3:%.*]] = lshr i32 [[SUB106_3]], 15
+; CHECK-NEXT:    [[AND_I195_3:%.*]] = and i32 [[SHR_I194_3]], 65537
+; CHECK-NEXT:    [[MUL_I196_3:%.*]] = mul nuw i32 [[AND_I195_3]], 65535
+; CHECK-NEXT:    [[ADD_I197_3:%.*]] = add i32 [[MUL_I196_3]], [[SUB106_3]]
+; CHECK-NEXT:    [[XOR_I198_3:%.*]] = xor i32 [[ADD_I197_3]], [[MUL_I196_3]]
+; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I188_3]], [[ADD113_2]]
+; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[XOR_I_3]]
+; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[XOR_I193_3]]
+; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I198_3]]
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[ADD113_3]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[ADD113_3]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -7,21 +7,20 @@
 ; CHECK-LABEL: @slp_not_profitable_with_fast_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[SHUFFLE1]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub fast <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP9]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub fast float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -47,21 +46,20 @@
 ; CHECK-LABEL: @slp_not_profitable_with_reassoc_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul reassoc float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE1]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub reassoc <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd reassoc <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP9]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub reassoc float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul reassoc float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd reassoc float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -88,21 +86,20 @@
 ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fadd_fsub(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[SHUFFLE1]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP9]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -129,21 +126,20 @@
 ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fmul_fadd_fsub(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE1]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP9]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -170,21 +166,20 @@
 ; CHECK-LABEL: @slp_profitable_missing_fmf_nnans_only(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul nnan float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul nnan <2 x float> [[SHUFFLE1]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul nnan <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub nnan <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd nnan <2 x float> [[TMP5]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP9]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul nnan float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub nnan float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul nnan float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul nnan float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd nnan float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -267,16 +262,16 @@
 ; CHECK-NEXT:    [[SUB_I1096:%.*]] = fsub fast float 1.000000e+00, [[TMP0:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[SUB_I1096]], i32 0
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x float> [[SHUFFLE1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fsub fast <2 x float> [[SHUFFLE1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[SUB_I1096]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <2 x float> [[SHUFFLE]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x float> [[SHUFFLE]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP10]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -18,19 +18,20 @@
 define void @s116_modified(ptr %a) {
 ; CHECK-LABEL: @s116_modified(
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2
 ; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3
 ; CHECK-NEXT:    [[LD0:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 undef>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]]
-; CHECK-NEXT:    store <4 x float> [[TMP12]], ptr [[A]], align 4
+; CHECK-NEXT:    [[LD1:%.*]] = load float, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[LD2:%.*]] = load float, ptr [[GEP2]], align 4
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul fast float [[LD0]], [[LD1]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul fast float [[LD2]], [[LD1]]
+; CHECK-NEXT:    store float [[MUL0]], ptr [[A]], align 4
+; CHECK-NEXT:    store float [[MUL1]], ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[LD2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    store <2 x float> [[TMP4]], ptr [[GEP2]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep1 = getelementptr inbounds float, ptr %a, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
@@ -211,15 +211,14 @@
 ; CHECK-NEXT:    [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
 ; CHECK-NEXT:    [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_0]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_1]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_2]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_3]])
-; CHECK-NEXT:    store <9 x double> [[TMP4]], ptr [[PTR_1]], align 8
+; CHECK-NEXT:    store <9 x double> [[TMP3]], ptr [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -13,7 +13,7 @@
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = and <2 x i32> [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32> <i32 0, i32 3>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -232,23 +232,27 @@
 
 define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
-; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6
+; SSE-NEXT:    [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 4
+; SSE-NEXT:    [[B5:%.*]] = extractelement <8 x i32> [[B]], i64 5
+; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i64 6
 ; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
+; SSE-NEXT:    [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[AB4]], i64 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i64 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -232,23 +232,27 @@
 
 define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
-; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6
+; SSE-NEXT:    [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 4
+; SSE-NEXT:    [[B5:%.*]] = extractelement <8 x i32> [[B]], i64 5
+; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i64 6
 ; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
+; SSE-NEXT:    [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[AB4]], i64 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i64 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
@@ -607,26 +607,35 @@
 ; SSE-NEXT:    ret <8 x double> [[TMP1]]
 ;
 ; SLM-LABEL: @buildvector_div_8f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A:%.*]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
+; SLM-NEXT:    [[B2:%.*]] = extractelement <8 x double> [[B:%.*]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3
+; SLM-NEXT:    [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4
+; SLM-NEXT:    [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5
+; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6
+; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
 ; SLM-NEXT:    [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
-; SLM-NEXT:    [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SLM-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SLM-NEXT:    ret <8 x double> [[R73]]
+; SLM-NEXT:    [[C2:%.*]] = fdiv double [[A2]], [[B2]]
+; SLM-NEXT:    [[C3:%.*]] = fdiv double [[A3]], [[B3]]
+; SLM-NEXT:    [[C4:%.*]] = fdiv double [[A4]], [[B4]]
+; SLM-NEXT:    [[C5:%.*]] = fdiv double [[A5]], [[B5]]
+; SLM-NEXT:    [[C6:%.*]] = fdiv double [[A6]], [[B6]]
+; SLM-NEXT:    [[C7:%.*]] = fdiv double [[A7]], [[B7]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[TMP4]], double [[C2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
+; SLM-NEXT:    ret <8 x double> [[R7]]
 ;
 ; AVX-LABEL: @buildvector_div_8f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
@@ -607,26 +607,35 @@
 ; SSE-NEXT:    ret <8 x double> [[TMP1]]
 ;
 ; SLM-LABEL: @buildvector_div_8f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A:%.*]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
+; SLM-NEXT:    [[B2:%.*]] = extractelement <8 x double> [[B:%.*]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3
+; SLM-NEXT:    [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4
+; SLM-NEXT:    [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5
+; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6
+; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
 ; SLM-NEXT:    [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
-; SLM-NEXT:    [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SLM-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SLM-NEXT:    ret <8 x double> [[R73]]
+; SLM-NEXT:    [[C2:%.*]] = fdiv double [[A2]], [[B2]]
+; SLM-NEXT:    [[C3:%.*]] = fdiv double [[A3]], [[B3]]
+; SLM-NEXT:    [[C4:%.*]] = fdiv double [[A4]], [[B4]]
+; SLM-NEXT:    [[C5:%.*]] = fdiv double [[A5]], [[B5]]
+; SLM-NEXT:    [[C6:%.*]] = fdiv double [[A6]], [[B6]]
+; SLM-NEXT:    [[C7:%.*]] = fdiv double [[A7]], [[B7]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[TMP4]], double [[C2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
+; SLM-NEXT:    ret <8 x double> [[R7]]
 ;
 ; AVX-LABEL: @buildvector_div_8f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
@@ -67,7 +67,7 @@
 ; CHECK-NEXT:    [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
 ; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1
 ; CHECK-NEXT:    [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -16,13 +16,28 @@
 
 define void @splat(i8 %a, i8 %b, i8 %c) {
 ; SSE-LABEL: @splat(
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer
-; SSE-NEXT:    [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]]
-; SSE-NEXT:    store <16 x i8> [[TMP6]], ptr @cle, align 16
+; SSE-NEXT:    [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]]
+; SSE-NEXT:    store i8 [[TMP1]], ptr @cle, align 16
+; SSE-NEXT:    [[TMP2:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP2]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 1), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP3]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 2), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP4]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 3), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = xor i8 [[C]], [[A]]
+; SSE-NEXT:    store i8 [[TMP5]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 4), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]]
+; SSE-NEXT:    store i8 [[TMP6]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 5), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = xor i8 [[C]], [[A]]
+; SSE-NEXT:    store i8 [[TMP7]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 6), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = xor i8 [[C]], [[B]]
+; SSE-NEXT:    store i8 [[TMP8]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 7), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> poison, <8 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP11:%.*]] = insertelement <8 x i8> poison, i8 [[C]], i32 0
+; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> poison, <8 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP13:%.*]] = xor <8 x i8> [[TMP10]], [[TMP12]]
+; SSE-NEXT:    store <8 x i8> [[TMP13]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 8), align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @splat(
@@ -95,11 +110,10 @@
 ; AVX-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[B:%.*]], i32 1
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[C]], i32 2
-; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
-; AVX-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP5]], [[TMP8]]
-; AVX-NEXT:    store <4 x i32> [[TMP9]], ptr @cle32, align 16
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 undef, i32 4, i32 0>
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[B:%.*]], i32 1
+; AVX-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP5]], [[TMP7]]
+; AVX-NEXT:    store <4 x i32> [[TMP8]], ptr @cle32, align 16
 ; AVX-NEXT:    ret void
 ;
   %add1 = add i32 %c, %a
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
@@ -5,23 +5,19 @@
 define i1 @foo() {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT:    br i1 false, label [[TMP15:%.*]], label [[TMP2:%.*]]
+; CHECK-NEXT:    br i1 false, label [[TMP11:%.*]], label [[TMP2:%.*]]
 ; CHECK:       2:
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> zeroinitializer, i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> zeroinitializer, <4 x float> [[TMP7]], <4 x float> [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fsub <4 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    br label [[TMP15]]
-; CHECK:       15:
-; CHECK-NEXT:    [[TMP16:%.*]] = phi <4 x float> [ [[TMP14]], [[TMP2]] ], [ zeroinitializer, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> <float undef, float 0.000000e+00>, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x float> [[TMP5]], <4 x float> [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub <4 x float> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x float> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    br label [[TMP11]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x float> [ [[TMP10]], [[TMP2]] ], [ zeroinitializer, [[TMP0:%.*]] ]
 ; CHECK-NEXT:    ret i1 false
 ;
   %1 = load float, ptr null, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
@@ -30,7 +30,7 @@
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
 ; CHECK-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
@@ -127,10 +127,10 @@
 ; CHECK-LABEL: @dct36(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds double, ptr [[INBUF:%.*]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[INBUF]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[ARRAYIDX44]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[INBUF]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[ARRAYIDX44]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
@@ -23,7 +23,7 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @c, align 4
 ; CHECK-NEXT:    [[AND:%.*]] = and i32 [[TMP2]], 7
 ; CHECK-NEXT:    store i32 [[AND]], ptr @a, align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> <i32 poison, i32 0>, <2 x i32> [[TMP0]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP0]], i32 0, i32 1
 ; CHECK-NEXT:    switch i32 [[AND]], label [[IF_END:%.*]] [
 ; CHECK-NEXT:    i32 7, label [[SAVE_STATE_AND_RETURN]]
 ; CHECK-NEXT:    i32 0, label [[SAVE_STATE_AND_RETURN]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -99,15 +99,7 @@
 ; CHECK-NEXT:    br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
 ; CHECK:       if.then38:
 ; CHECK-NEXT:    [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> <double 6.000000e-01, double poison>, double 6.000000e-02, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> <double 5.000000e-01, double 8.000000e-01>, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> <double 2.400000e-02, double 0.000000e+00>, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> <double 9.000000e-01, double 9.100000e-01>, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> <double 9.200000e-01, double 9.300000e-01>, [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> <double 0x3FEE147AE147AE14, double 0x3FEE666666666666>, [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> <double 0x3FEEB851EB851EB8, double 0x3FEF0A3D70A3D70A>, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> <double 0x3FEF5C28F5C28F5C, double 0x3FEFAE147AE147AE>, [[TMP6]]
-; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8
+; CHECK-NEXT:    store <2 x double> <double 0x3FFA356C1D8A7F76, double 0x3FFDC4F38B38BEF4>, ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8
 ; CHECK-NEXT:    br label [[IF_THEN78]]
 ; CHECK:       if.then78:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -19,14 +19,12 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> [[TMP0]], <double 4.000000e+00, double 3.000000e+00>
 ; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 1.000000e+00, double 6.000000e+00>
 ; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[G]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[G]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
-; CHECK-NEXT:    [[MUL11:%.*]] = fmul double [[TMP4]], 4.000000e+00
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL11]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], <double 7.000000e+00, double 8.000000e+00>
-; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
+; CHECK-NEXT:    [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[MUL11]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], <double 7.000000e+00, double 8.000000e+00>
+; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[ARRAYIDX9]], align 8
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
@@ -10,11 +10,10 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
 ; CHECK-NEXT:    br i1 false, label [[BB5:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <2 x i32> [[TMP4]], <i32 3, i32 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
-; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], <i32 3, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1
+; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[OP_RDX10]], 0
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb2:
@@ -23,10 +22,10 @@
 ; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[OP_RDX11]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
 ; CHECK-NEXT:    ret i32 0
 ; CHECK:       bb4:
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
-; CHECK-NEXT:    [[OP_RDX8:%.*]] = add i32 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP2]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT:    [[OP_RDX8:%.*]] = add i32 [[TMP9]], 0
 ; CHECK-NEXT:    [[OP_RDX9:%.*]] = add i32 [[OP_RDX8]], [[TMP3]]
 ; CHECK-NEXT:    ret i32 [[OP_RDX9]]
 ; CHECK:       bb5:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
@@ -9,11 +9,10 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> <i32 8, i32 poison, i32 ptrtoint (ptr @fn1 to i32), i32 poison>, <4 x i32> [[TMP0]], <4 x i32> <i32 0, i32 5, i32 2, i32 undef>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[SHUFFLE]], <4 x i32> <i32 0, i32 6, i32 0, i32 0>
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT:    store <4 x i32> [[SHUFFLE1]], ptr @a, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> <i32 8, i32 poison, i32 ptrtoint (ptr @fn1 to i32), i32 ptrtoint (ptr @fn1 to i32)>, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 6, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr @a, align 4
 ; CHECK-NEXT:    ret i32 0
 ;
   entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -524,37 +524,22 @@
 
 
 define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
-; SSE-LABEL: @foo(
-; SSE-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-; SSE-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; SSE-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
-; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
-; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
-; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; SSE-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
-; SSE-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; SSE-NEXT:    ret i1 [[CMP_I185]]
-;
-; AVX-LABEL: @foo(
-; AVX-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-; AVX-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; AVX-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
-; AVX-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
-; AVX-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1
-; AVX-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
-; AVX-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
-; AVX-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
-; AVX-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
-; AVX-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
-; AVX-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; AVX-NEXT:    ret i1 [[CMP_I185]]
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
+; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> <i32 undef, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP_I185]]
 ;
   %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
   %sub14.i167 = fsub float undef, %vecext.i291.i166
@@ -643,12 +628,11 @@
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 ; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP0]], [[TMP4]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]]
-; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; SSE-NEXT:    [[ADD3:%.*]] = fadd double [[TMP7]], [[TMP8]]
+; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+; SSE-NEXT:    [[ADD3:%.*]] = fadd double [[TMP6]], [[TMP7]]
 ; SSE-NEXT:    ret double [[ADD3]]
 ;
 ; AVX-LABEL: @splat_loads(
@@ -700,14 +684,13 @@
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 ; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP0]], [[TMP4]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:    [[TMP8:%.*]] = fsub <2 x double> [[TMP6]], [[TMP7]]
-; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
-; SSE-NEXT:    [[RES:%.*]] = fadd double [[TMP9]], [[TMP10]]
+; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; SSE-NEXT:    [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]]
 ; SSE-NEXT:    ret double [[RES]]
 ;
 ; AVX-LABEL: @splat_loads_with_internal_uses(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
@@ -15,15 +15,13 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 6
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 7
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 9
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 undef, i32 undef, i32 poison>, i32 [[SUB86_1]], i32 4
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[ADD78_1]], i32 5
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 6
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_1]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_3]], i32 12
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12>
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP11:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 6, i32 5, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[SUB102_3]], i32 12
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_3]], i32 15
+; CHECK-NEXT:    [[TMP9:%.*]] = freeze <16 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <16 x i32> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP13:%.*]] = lshr <16 x i32> [[TMP12]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i32> [[TMP13]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
@@ -187,9 +187,9 @@
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[BB16:%.*]], label [[BB6:%.*]]
 ; CHECK:       bb6:
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[ARG1:%.*]], i32 3
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, float [[ARG2:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> poison, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[ARG2:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    ret void
@@ -226,9 +226,9 @@
 ; CHECK-NEXT:    [[TMP38:%.*]] = fadd float 0.000000e+00, [[TMP37]]
 ; CHECK-NEXT:    store float [[TMP38]], ptr [[TMP35]], align 4
 ; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr float, ptr [[ARG4]], i64 1
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x float>, ptr [[TMP39]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x float> zeroinitializer, [[TMP7]]
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[TMP39]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[TMP39]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x float> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    store <2 x float> [[TMP5]], ptr [[TMP39]], align 4
 ; CHECK-NEXT:    [[TMP44:%.*]] = load float, ptr [[ARG3:%.*]], align 4
 ; CHECK-NEXT:    [[TMP45:%.*]] = load float, ptr [[ARG4]], align 4
 ; CHECK-NEXT:    [[TMP46:%.*]] = fadd float 0.000000e+00, [[TMP45]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -9,19 +9,19 @@
 
 define void @shuffle_operands1(ptr noalias %from, ptr noalias %to, double %v1, double %v2) {
 ; CHECK-LABEL: @shuffle_operands1(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[TO:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; SSE2-LABEL: @shuffle_operands1(
-; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
-; SSE2-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i64 1
-; SSE2-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SSE2-NEXT:    store <2 x double> [[TMP5]], ptr [[TO:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
+; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
+; SSE2-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; SSE2-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    ret void
 ;
   %from_1 = getelementptr double, ptr %from, i64 1
@@ -41,11 +41,11 @@
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; CHECK:       ext:
 ; CHECK-NEXT:    ret void
@@ -55,11 +55,11 @@
 ; SSE2-NEXT:    br label [[LP:%.*]]
 ; SSE2:       lp:
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; SSE2-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; SSE2-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
+; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; SSE2:       ext:
 ; SSE2-NEXT:    ret void
@@ -89,11 +89,11 @@
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; CHECK:       ext:
 ; CHECK-NEXT:    ret void
@@ -103,11 +103,11 @@
 ; SSE2-NEXT:    br label [[LP:%.*]]
 ; SSE2:       lp:
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; SSE2-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
-; SSE2-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
+; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; SSE2:       ext:
 ; SSE2-NEXT:    ret void
@@ -137,11 +137,11 @@
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; CHECK:       ext:
 ; CHECK-NEXT:    ret void
@@ -151,11 +151,11 @@
 ; SSE2-NEXT:    br label [[LP:%.*]]
 ; SSE2:       lp:
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; SSE2-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
-; SSE2-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
+; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; SSE2:       ext:
 ; SSE2-NEXT:    ret void
@@ -185,10 +185,10 @@
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; CHECK:       ext:
@@ -199,10 +199,10 @@
 ; SSE2-NEXT:    br label [[LP:%.*]]
 ; SSE2:       lp:
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1
-; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]]
+; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; SSE2:       ext:
@@ -251,10 +251,10 @@
 ; SSE2-NEXT:    br label [[LP:%.*]]
 ; SSE2:       lp:
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1
-; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]]
+; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; SSE2:       ext:
@@ -304,10 +304,10 @@
 ; SSE2-NEXT:    br label [[LP:%.*]]
 ; SSE2:       lp:
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1
-; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP2]]
+; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; SSE2:       ext:
@@ -345,7 +345,7 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr @a, align 16
 ; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
 ; CHECK:       for.body3:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
@@ -355,20 +355,20 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
 ; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 undef, i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP13]]
-; CHECK-NEXT:    [[TMP14]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 3
-; CHECK-NEXT:    [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
+; CHECK-NEXT:    [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
 ; CHECK-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
@@ -380,7 +380,7 @@
 ; SSE2-NEXT:    [[TMP0:%.*]] = load float, ptr @a, align 16
 ; SSE2-NEXT:    br label [[FOR_BODY3:%.*]]
 ; SSE2:       for.body3:
-; SSE2-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ]
+; SSE2-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
 ; SSE2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
 ; SSE2-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; SSE2-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
@@ -390,20 +390,20 @@
 ; SSE2-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; SSE2-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
 ; SSE2-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
-; SSE2-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
-; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
-; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 5, i32 6>
-; SSE2-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
-; SSE2-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
+; SSE2-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 undef, i32 0, i32 1, i32 2>
+; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
+; SSE2-NEXT:    [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
+; SSE2-NEXT:    store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
 ; SSE2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; SSE2-NEXT:    [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; SSE2-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP13]]
-; SSE2-NEXT:    [[TMP14]] = load float, ptr [[ARRAYIDX41]], align 4
-; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 3
-; SSE2-NEXT:    [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]]
+; SSE2-NEXT:    [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; SSE2-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
+; SSE2-NEXT:    [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
+; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
+; SSE2-NEXT:    [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
 ; SSE2-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
-; SSE2-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; SSE2-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995
+; SSE2-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; SSE2-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
 ; SSE2-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
 ; SSE2:       for.end:
 ; SSE2-NEXT:    ret void
@@ -458,17 +458,17 @@
 
 define void @load_reorder_double(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){
 ; CHECK-LABEL: @load_reorder_double(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; SSE2-LABEL: @load_reorder_double(
-; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
-; SSE2-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
-; SSE2-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SSE2-NEXT:    store <2 x double> [[TMP5]], ptr [[C:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
 ; SSE2-NEXT:    ret void
 ;
   %1 = load double, ptr %a
@@ -493,17 +493,17 @@
 
 define void @load_reorder_float(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){
 ; CHECK-LABEL: @load_reorder_float(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; SSE2-LABEL: @load_reorder_float(
-; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
-; SSE2-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
-; SSE2-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SSE2-NEXT:    store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
 ; SSE2-NEXT:    ret void
 ;
   %1 = load float, ptr %a
@@ -542,21 +542,21 @@
 
 define void @opcode_reorder(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c,ptr noalias nocapture readonly %d) {
 ; CHECK-LABEL: @opcode_reorder(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[TMP5]]
-; CHECK-NEXT:    store <4 x float> [[TMP8]], ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; SSE2-LABEL: @opcode_reorder(
-; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
-; SSE2-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
-; SSE2-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SSE2-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
-; SSE2-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[TMP5]]
-; SSE2-NEXT:    store <4 x float> [[TMP8]], ptr [[A:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
+; SSE2-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
+; SSE2-NEXT:    store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
 ; SSE2-NEXT:    ret void
 ;
   %1 = load float, ptr %b
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -26,11 +26,11 @@
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_ELSE:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.else:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 10
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[IF_ELSE]] ], [ <double 3.000000e+00, double 5.000000e+00>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ [[TMP0]], [[IF_ELSE]] ], [ <double 3.000000e+00, double 5.000000e+00>, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[A]], align 8
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:
@@ -73,19 +73,19 @@
 define i32 @foo2(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 %n, i32 %m) #0 {
 ; CHECK-LABEL: @foo2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_019:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+01, double 1.000000e+01>
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
-; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ [[TMP0]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 1.000000e+01, double 1.000000e+01>
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP4]] = fadd <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_019]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[B:%.*]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
@@ -138,41 +138,40 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 0
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x float> [ [[TMP3]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP6]], 7.000000e+00
 ; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[TMP12]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP12]], <4 x i32> <i32 1, i32 undef, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul <4 x float> [[TMP14]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
-; CHECK-NEXT:    [[TMP16]] = fadd <4 x float> [[TMP6]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP17]], 121
+; CHECK-NEXT:    [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP9]], <4 x i32> <i32 1, i32 undef, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul <4 x float> [[TMP11]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
+; CHECK-NEXT:    [[TMP13]] = fadd <4 x float> [[TMP4]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP16]], i32 0
-; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP16]], i32 1
-; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP19]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP16]], i32 2
-; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP20]]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP16]], i32 3
-; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP21]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP13]], i32 0
+; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP15]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP13]], i32 1
+; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP13]], i32 2
+; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP13]], i32 3
+; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP18]]
 ; CHECK-NEXT:    ret float [[ADD31]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -68,14 +68,13 @@
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; SSE-NEXT:    [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer
 ; SSE-NEXT:    store <2 x i64> [[TMP5]], ptr undef, align 1
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
-; SSE-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
-; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
-; SSE-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP5]], <i64 6, i64 6>
-; SSE-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]]
-; SSE-NEXT:    store <2 x i64> [[TMP12]], ptr [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
+; SSE-NEXT:    [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; SSE-NEXT:    [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], <i64 6, i64 6>
+; SSE-NEXT:    [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]]
+; SSE-NEXT:    store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @pr35497(
@@ -89,14 +88,13 @@
 ; AVX-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
 ; AVX-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
 ; AVX-NEXT:    store <2 x i64> [[TMP4]], ptr undef, align 1
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; AVX-NEXT:    [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
-; AVX-NEXT:    [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
-; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
-; AVX-NEXT:    [[TMP10:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
-; AVX-NEXT:    [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]]
-; AVX-NEXT:    store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[ADD]], i32 0
+; AVX-NEXT:    [[TMP6:%.*]] = shl <2 x i64> [[TMP5]], <i64 2, i64 2>
+; AVX-NEXT:    [[TMP7:%.*]] = and <2 x i64> [[TMP6]], <i64 20, i64 20>
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; AVX-NEXT:    [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
+; AVX-NEXT:    [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]]
+; AVX-NEXT:    store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1
 ; AVX-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
 ; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX
+; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
 ; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512
 ; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
 
@@ -25,20 +25,33 @@
 ;
 ; AVX-LABEL: @foo(
 ; AVX-NEXT:    [[TMP1:%.*]] = load i32, ptr @b, align 16
+; AVX-NEXT:    store i32 [[TMP1]], ptr @a, align 16
 ; AVX-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
-; AVX-NEXT:    store <8 x i32> [[SHUFFLE]], ptr @a, align 16
+; AVX-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 1), align 4
+; AVX-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 2), align 8
+; AVX-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 3), align 4
+; AVX-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 4), align 16
+; AVX-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 5), align 4
+; AVX-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 6), align 8
+; AVX-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 7), align 4
 ; AVX-NEXT:    ret void
 ;
+; AVX2-LABEL: @foo(
+; AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr @b, align 16
+; AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
+; AVX2-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; AVX2-NEXT:    store <8 x i32> [[TMP5]], ptr @a, align 16
+; AVX2-NEXT:    ret void
+;
 ; AVX512-LABEL: @foo(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load i32, ptr @b, align 16
 ; AVX512-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
 ; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
 ; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1
-; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
-; AVX512-NEXT:    store <8 x i32> [[SHUFFLE]], ptr @a, align 16
+; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; AVX512-NEXT:    store <8 x i32> [[TMP5]], ptr @a, align 16
 ; AVX512-NEXT:    ret void
 ;
   %1 = load i32, ptr @b, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
 
+; FIXME: fix the cost of the xor reduction ops
+
 define i16 @test() {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
@@ -8,19 +10,17 @@
 ; CHECK-NEXT:    [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
 ; CHECK-NEXT:    br label [[WHILE:%.*]]
 ; CHECK:       while:
-; CHECK-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX25:%.*]], [[WHILE]] ]
+; CHECK-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX27:%.*]], [[WHILE]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[A1]], align 16
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP7]])
-; CHECK-NEXT:    [[OP_RDX23:%.*]] = xor i64 0, [[TMP1]]
-; CHECK-NEXT:    [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP8]]
-; CHECK-NEXT:    [[OP_RDX25]] = xor i64 [[OP_RDX23]], [[OP_RDX24]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A1]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT:    [[OP_RDX24:%.*]] = xor i64 0, [[TMP2]]
+; CHECK-NEXT:    [[OP_RDX25:%.*]] = xor i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[OP_RDX26:%.*]] = xor i64 [[OP_RDX24]], [[OP_RDX25]]
+; CHECK-NEXT:    [[OP_RDX27]] = xor i64 [[OP_RDX26]], [[TMP5]]
 ; CHECK-NEXT:    br label [[WHILE]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -329,20 +329,12 @@
 
 define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: @logical_and_icmp_clamp_v8i32(
-; CHECK-NEXT:    [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
-; CHECK-NEXT:    [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[Y0]], i32 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[Y1]], i32 5
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[Y2]], i32 6
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y3]], i32 7
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt <8 x i32> [[TMP2]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = freeze <8 x i1> [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP8]])
-; CHECK-NEXT:    ret i1 [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[Y:%.*]], <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 poison, i32 poison, i32 poison, i32 poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]])
+; CHECK-NEXT:    ret i1 [[TMP5]]
 ;
   %x0 = extractelement <8 x i32> %x, i32 0
   %x1 = extractelement <8 x i32> %x, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
@@ -89,7 +89,7 @@
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0
@@ -138,7 +138,7 @@
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FNEG]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
@@ -11,18 +11,18 @@
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, <8 x ptr> [[SHUFFLE]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15>
 ; CHECK-NEXT:    [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
-; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <8 x double> [[TMP7]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP8]])
-; CHECK-NEXT:    [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]])
-; CHECK-NEXT:    [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i64 0
-; CHECK-NEXT:    [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP10]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]])
+; CHECK-NEXT:    [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
 ; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> <i1 true, i1 true>)
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
@@ -8,7 +8,7 @@
 ; CHECK-NEXT:    [[LD:%.*]] = load <8 x i16>, ptr undef, align 16
 ; CHECK-NEXT:    br label [[T:%.*]]
 ; CHECK:       t:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[LD]], <8 x i16> poison, <8 x i32> <i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[LD]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i16> [[LD]], [[TMP0]]
 ; CHECK-NEXT:    store <8 x i16> [[TMP1]], ptr [[PTR:%.*]], align 2
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll
@@ -14,17 +14,16 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x ptr> poison, ptr [[I250]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[I242]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[I245]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x ptr> [[TMP7]], ptr [[I248]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP8]], <8 x ptr> poison, <8 x i32> <i32 2, i32 0, i32 1, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x ptr> <ptr poison, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x ptr> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult <8 x ptr> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i1> [[TMP4]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP13]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP14]], false
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 2, i32 0, i32 1, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP13]], false
 ; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
@@ -10,7 +10,7 @@
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison)
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> <i32 4, i32 3, i32 0, i32 1, i32 2, i32 0, i32 1, i32 2, i32 0, i32 2, i32 5, i32 6, i32 7, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x float> [[TMP8]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> poison, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 9, i32 0, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    store <16 x float> [[TMP11]], ptr [[TMP5]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll
@@ -17,9 +17,8 @@
 ; CHECK-NEXT:    [[T13:%.*]] = and <2 x i32> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    br label [[ELSE1:%.*]]
 ; CHECK:       else1:
-; CHECK-NEXT:    [[T20:%.*]] = extractelement <2 x i32> [[T13]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[BF_CAST162]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[T20]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[T13]], <2 x i32> poison, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[BF_CAST162]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt <2 x i32> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
 ; CHECK-NEXT:    ret i1 [[TMP8]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
@@ -12,10 +12,10 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr undef, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> <float 0.000000e+00, float poison>, <2 x float> [[TMP0]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP0]], float 0.000000e+00, i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]])
 ; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
@@ -23,12 +23,12 @@
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP1]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x float> [[TMP12]], zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = fsub <2 x float> [[TMP13]], zeroinitializer
-; CHECK-NEXT:    store <2 x float> [[TMP14]], ptr [[ARRAYIDX21_I]], align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = fsub <2 x float> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    store <2 x float> [[TMP15]], ptr [[ARRAYIDX21_I]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
@@ -7,13 +7,12 @@
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP1]], undef
-; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> undef, <4 x i32> [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP6]]
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[TMP27]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[TMP1]], undef
+; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> undef, <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[TMP27]], align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
@@ -13,16 +13,15 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr null, align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x float> [[TMP9]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <4 x float> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    store <4 x float> [[TMP13]], ptr [[RESULT]], align 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x float> [[SHUFFLE]], [[TMP6]]
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x float> [[SHUFFLE]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x float> [[SHUFFLE1]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    store <4 x float> [[TMP10]], ptr [[RESULT]], align 4
 ; CHECK-NEXT:    br label [[FOR_BODY]]
 ;
 entry: