diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -430,26 +430,6 @@ /// i32 6> /// %2 = mul <4 x i8> %1, %1 /// ret <4 x i8> %2 -/// We convert this initially to something like: -/// %x0 = extractelement <4 x i8> %x, i32 0 -/// %x3 = extractelement <4 x i8> %x, i32 3 -/// %y1 = extractelement <4 x i8> %y, i32 1 -/// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 -/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 -/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 -/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 -/// %5 = mul <4 x i8> %4, %4 -/// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 -/// %7 = extractelement <4 x i8> %5, i32 1 -/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 -/// %8 = extractelement <4 x i8> %5, i32 2 -/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 -/// %9 = extractelement <4 x i8> %5, i32 3 -/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 -/// ret <4 x i8> %ins4 -/// InstCombiner transforms this into a shuffle and vector mul /// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// ShuffleVectorInst/getShuffleCost? @@ -1080,6 +1060,7 @@ class BoUpSLP { struct TreeEntry; struct ScheduleData; + class ShuffleCostEstimator; class ShuffleInstructionBuilder; public: @@ -2459,6 +2440,12 @@ /// \p E. Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx); + /// Create a new vector from a list of scalar values. Produces a sequence + /// which exploits values reused across lanes, and arranges the inserts + /// for ease of later optimization. + template + ResTy processBuildVector(const TreeEntry *E, Args &...Params); + /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts /// for ease of later optimization. @@ -2559,7 +2546,7 @@ // directly, without reordering. SmallVector Mask; inversePermutation(ReorderIndices, Mask); - if (VL.size() == Scalars.size()) + if (VL.size() == Scalars.size() && ReuseShuffleIndices.empty()) return IsSame(Scalars, Mask); if (VL.size() == ReuseShuffleIndices.size()) { ::addMask(Mask, ReuseShuffleIndices); @@ -6295,68 +6282,6 @@ return {IntrinsicCost, LibCost}; } -/// Compute the cost of creating a vector of type \p VecTy containing the -/// extracted values from \p VL. -static InstructionCost -computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, - TargetTransformInfo::ShuffleKind ShuffleKind, - ArrayRef Mask, TargetTransformInfo &TTI) { - unsigned NumOfParts = TTI.getNumberOfParts(VecTy); - - if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts || - VecTy->getNumElements() < NumOfParts) - return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - - bool AllConsecutive = true; - unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; - unsigned Idx = -1; - InstructionCost Cost = 0; - - // Process extracts in blocks of EltsPerVector to check if the source vector - // operand can be re-used directly. If not, add the cost of creating a shuffle - // to extract the values into a vector register. - SmallVector RegMask(EltsPerVector, UndefMaskElem); - for (auto *V : VL) { - ++Idx; - - // Reached the start of a new vector registers. 
- if (Idx % EltsPerVector == 0) { - RegMask.assign(EltsPerVector, UndefMaskElem); - AllConsecutive = true; - continue; - } - - // Need to exclude undefs from analysis. - if (isa(V) || Mask[Idx] == UndefMaskElem) - continue; - - // Check all extracts for a vector register on the target directly - // extract values in order. - unsigned CurrentIdx = *getExtractIndex(cast(V)); - if (!isa(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) { - unsigned PrevIdx = *getExtractIndex(cast(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; - RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; - } - - if (AllConsecutive) - continue; - - // Skip all indices, except for the last index per vector block. - if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) - continue; - - // If we have a series of extracts which are not consecutive and hence - // cannot re-use the source vector register directly, compute the shuffle - // cost to extract the vector with EltsPerVector elements. - Cost += TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask); - } - return Cost; -} - /// Build shuffle mask for shuffle graph entries and lists of main and alternate /// operations operands. static void @@ -6667,9 +6592,9 @@ /// Smart shuffle instruction emission, walks through shuffles trees and /// tries to find the best matching vector for the actual shuffle /// instruction. - template - static Value *createShuffle(Value *V1, Value *V2, ArrayRef Mask, - ShuffleBuilderTy &Builder) { + template + static T createShuffle(Value *V1, Value *V2, ArrayRef Mask, + ShuffleBuilderTy &Builder) { assert(V1 && "Expected at least one vector value."); if (V2) Builder.resizeToMatch(V1, V2); @@ -6768,62 +6693,112 @@ isa(Op1) && cast(Op1)->getShuffleMask() == ArrayRef(CombinedMask1)))) - return Op1; + return Builder.createIdentity(Op1); return Builder.createShuffleVector( Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2, CombinedMask1); } if (isa(V1)) - return PoisonValue::get(FixedVectorType::get( - cast(V1->getType())->getElementType(), Mask.size())); + return Builder.createPoison( + cast(V1->getType())->getElementType(), Mask.size()); SmallVector NewMask(Mask.begin(), Mask.end()); bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true); assert(V1 && "Expected non-null value after looking through shuffles."); if (!IsIdentity) return Builder.createShuffleVector(V1, NewMask); - return V1; + return Builder.createIdentity(V1); } }; } // namespace -InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, - ArrayRef VectorizedVals) { - ArrayRef VL = E->Scalars; +/// Merges shuffle masks and emits final shuffle instruction, if required. It +/// supports shuffling of 2 input vectors. It implements lazy shuffles emission, +/// when the actual shuffle instruction is generated only if this is actually +/// required. Otherwise, the shuffle instruction emission is delayed till the +/// end of the process, to reduce the number of emitted instructions and further +/// analysis/transformations. 
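
A minimal self-contained C++ sketch of the lazy bookkeeping described in the comment above, using invented names (LazyShuffleMask, kUndef) rather than the real BoUpSLP/TTI interfaces: masks from successive add() calls fold into a single common mask, and a shuffle is charged for only at finalization, and only when the accumulated mask is not an identity.

    // Hypothetical sketch only; not the BoUpSLP API. It mirrors two ideas from
    // the cost estimator: mask composition is deferred, and an empty or
    // identity mask needs no shuffle and can be treated as free.
    #include <cstdio>
    #include <utility>
    #include <vector>

    constexpr int kUndef = -1; // stands in for UndefMaskElem

    struct LazyShuffleMask {
      std::vector<int> CommonMask;

      // Fold a new mask into the accumulated one: NewMask[I] selects element
      // CommonMask[Mask[I]] of the original source.
      void add(const std::vector<int> &Mask) {
        if (CommonMask.empty()) {
          CommonMask = Mask;
          return;
        }
        std::vector<int> NewMask(Mask.size(), kUndef);
        for (size_t I = 0; I < Mask.size(); ++I)
          if (Mask[I] != kUndef)
            NewMask[I] = CommonMask[Mask[I]];
        CommonMask = std::move(NewMask);
      }

      // Only a non-identity accumulated mask would require an actual shuffle
      // instruction (and hence a non-zero cost) at finalization.
      bool needsShuffle() const {
        for (size_t I = 0; I < CommonMask.size(); ++I)
          if (CommonMask[I] != kUndef && CommonMask[I] != static_cast<int>(I))
            return true;
        return false;
      }
    };

    int main() {
      LazyShuffleMask M;
      M.add({1, 0, 3, 2});
      M.add({1, 0, 3, 2}); // the two swaps compose to the identity
      std::printf("needs shuffle: %s\n", M.needsShuffle() ? "yes" : "no"); // "no"
      return 0;
    }
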
+class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { + bool IsFinalized = false; + SmallVector CommonMask; + SmallVector InVectors; + const TargetTransformInfo &TTI; + InstructionCost Cost = 0; + ArrayRef VectorizedVals; + BoUpSLP &R; + constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - else if (auto *IE = dyn_cast(VL[0])) - ScalarTy = IE->getOperand(1)->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + class ShuffleCostBuilder { + const TargetTransformInfo &TTI; - // If we have computed a smaller type for the expression, update VecTy so - // that the costs will be accurate. - if (MinBWs.count(VL[0])) - VecTy = FixedVectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); - unsigned EntryVF = E->getVectorFactor(); - auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + static bool isEmptyOrIdentity(ArrayRef Mask, unsigned VF) { + int Limit = 2 * VF; + return Mask.empty() || + (VF == Mask.size() && + all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(Mask)); + } - bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - // FIXME: it tries to fix a problem with MSVC buildbots. - TargetTransformInfo *TTI = this->TTI; - auto AdjustExtractsCost = [=](InstructionCost &Cost, - ArrayRef Mask) -> Value * { + public: + ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {} + ~ShuffleCostBuilder() = default; + InstructionCost createShuffleVector(Value *V1, Value *, + ArrayRef Mask) const { + // Empty mask or identity mask are free. + unsigned VF = + cast(V1->getType())->getElementCount().getKnownMinValue(); + if (isEmptyOrIdentity(Mask, VF)) + return TTI::TCC_Free; + return TTI.getShuffleCost( + TTI::SK_PermuteTwoSrc, + FixedVectorType::get( + cast(V1->getType())->getElementType(), Mask.size()), + Mask); + } + InstructionCost createShuffleVector(Value *V1, ArrayRef Mask) const { + // Empty mask or identity mask are free. + if (isEmptyOrIdentity(Mask, Mask.size())) + return TTI::TCC_Free; + return TTI.getShuffleCost( + TTI::SK_PermuteSingleSrc, + FixedVectorType::get( + cast(V1->getType())->getElementType(), Mask.size()), + Mask); + } + InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } + InstructionCost createPoison(Type *Ty, unsigned VF) const { + return TTI::TCC_Free; + } + void resizeToMatch(Value *&, Value *&) const {} + }; + + /// Smart shuffle instruction emission, walks through shuffles trees and + /// tries to find the best matching vector for the actual shuffle + /// instruction. + InstructionCost createShuffle(Value *V1, Value *V2, ArrayRef Mask) { + ShuffleCostBuilder Builder(TTI); + return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, + Builder); + } + +public: + ShuffleCostEstimator(TargetTransformInfo &TTI, + ArrayRef VectorizedVals, BoUpSLP &R) + : TTI(TTI), VectorizedVals(VectorizedVals), R(R) {} + Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask) { if (Mask.empty()) return nullptr; Value *VecBase = nullptr; + ArrayRef VL = E->Scalars; + auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. 
- unsigned VecNumParts = TTI->getNumberOfParts(VecTy); + unsigned VecNumParts = TTI.getNumberOfParts(VecTy); if (VecNumParts == VecTy->getNumElements()) return nullptr; DenseMap ExtractVectorsTys; SmallPtrSet CheckedExtracts; for (auto [I, V] : enumerate(VL)) { + // Ignore non-extractelement scalars. if (isa(V) || (!Mask.empty() && Mask[I] == UndefMaskElem)) continue; // If all users of instruction are going to be vectorized and this @@ -6832,9 +6807,9 @@ // vectorized tree. // Also, avoid adjusting the cost for extractelements with multiple uses // in different graph entries. - const TreeEntry *VE = getTreeEntry(V); + const TreeEntry *VE = R.getTreeEntry(V); if (!CheckedExtracts.insert(V).second || - !areAllUsersVectorized(cast(V), VectorizedVals) || + !R.areAllUsersVectorized(cast(V), VectorizedVals) || (VE && VE != E)) continue; auto *EE = cast(V); @@ -6843,7 +6818,7 @@ if (!EEIdx) continue; unsigned Idx = *EEIdx; - if (VecNumParts != TTI->getNumberOfParts(EE->getVectorOperandType())) { + if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { auto It = ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; It->getSecond() = std::min(It->second, Idx); @@ -6856,18 +6831,17 @@ })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. - Cost -= - TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); + Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); // Add back the cost of s|zext which is subtracted separately. - Cost += TTI->getCastInstrCost( + Cost += TTI.getCastInstrCost( Ext->getOpcode(), Ext->getType(), EE->getType(), TTI::getCastContextHint(Ext), CostKind, Ext); continue; } } - Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, - Idx); + Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, + Idx); } // Add a cost for subvector extracts/inserts if required. for (const auto &Data : ExtractVectorsTys) { @@ -6875,256 +6849,366 @@ unsigned NumElts = VecTy->getNumElements(); if (Data.second % NumElts == 0) continue; - if (TTI->getNumberOfParts(EEVTy) > VecNumParts) { + if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { unsigned Idx = (Data.second / NumElts) * NumElts; unsigned EENumElts = EEVTy->getNumElements(); if (Idx % NumElts == 0) continue; if (Idx + NumElts <= EENumElts) { - Cost += - TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, VecTy); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, VecTy); } else { // Need to round up the subvector type vectorization factor to avoid a // crash in cost model functions. Make SubVT so that Idx + VF of SubVT // <= EENumElts. 
auto *SubVT = FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); - Cost += - TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, SubVT); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, SubVT); } } else { - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, std::nullopt, CostKind, 0, EEVTy); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, + VecTy, std::nullopt, CostKind, 0, EEVTy); } } return VecBase; - }; - if (E->State == TreeEntry::NeedToGather) { - if (allConstant(VL)) - return 0; - if (isa(VL[0])) - return InstructionCost::getInvalid(); - unsigned VF = E->getVectorFactor(); - SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), - E->ReuseShuffleIndices.end()); - SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); - // Build a mask out of the reorder indices and reorder scalars per this - // mask. - SmallVector ReorderMask; - inversePermutation(E->ReorderIndices, ReorderMask); - if (!ReorderMask.empty()) - reorderScalars(GatheredScalars, ReorderMask); - SmallVector Mask; - SmallVector ExtractMask; - std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; - Type *ScalarTy = GatheredScalars.front()->getType(); - // Check for gathered extracts. - ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); - SmallVector IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); - - InstructionCost Cost = 0; - bool Resized = false; - if (Value *VecBase = AdjustExtractsCost(Cost, ExtractMask)) - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - - // Do not try to look for reshuffled loads for gathered loads (they will be - // handled later), for vectorized scalars, and cases, which are definitely - // not profitable (splats and small gather nodes.) - if (ExtractShuffle || E->getOpcode() != Instruction::Load || - E->isAltShuffle() || - all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || - isSplat(E->Scalars) || - (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - if (GatherShuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); + } + std::optional + needToDelay(const TreeEntry *, ArrayRef) const { + // No need to delay the cost estimation during analysis. + return std::nullopt; + } + void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { + // Use zeroinitializer instead of actual vector value here, since they are + // not ready yet. 
+ add(Constant::getNullValue(FixedVectorType::get( + E1->Scalars.front()->getType(), E1->getVectorFactor())), + Constant::getNullValue(FixedVectorType::get( + E2->Scalars.front()->getType(), E2->getVectorFactor())), + Mask); + } + void add(const TreeEntry *E1, ArrayRef Mask) { + // Use zeroinitializer instead of actual vector value here, since they are + // not ready yet. + add(Constant::getNullValue(FixedVectorType::get( + E1->Scalars.front()->getType(), E1->getVectorFactor())), + Mask); + } + /// Adds 2 input vectors and the mask for their shuffling. + void add(Value *V1, Value *V2, ArrayRef Mask) { + assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); + if (InVectors.empty()) { + InVectors.push_back(V1); + InVectors.push_back(V2); + CommonMask.assign(Mask.begin(), Mask.end()); + return; + } + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Cost += createShuffle(Vec, InVectors.back(), CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } else if (cast(Vec->getType())->getNumElements() != + Mask.size()) { + Cost += createShuffle(Vec, nullptr, CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } + Cost += createShuffle(V1, V2, Mask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx + Sz; + InVectors.front() = Vec; + if (InVectors.size() == 2) + InVectors.back() = V1; + else + InVectors.push_back(V1); + } + /// Adds another one input vector and the mask for the shuffling. + void add(Value *V1, ArrayRef Mask) { + if (InVectors.empty()) { + if (!isa(V1->getType())) { + Cost += createShuffle(V1, nullptr, CommonMask); + CommonMask.assign(Mask.size(), UndefMaskElem); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; } - // Remove shuffled elements from list of gathers. - for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { - if (Mask[I] != UndefMaskElem) - GatheredScalars[I] = PoisonValue::get(ScalarTy); + InVectors.push_back(V1); + CommonMask.assign(Mask.begin(), Mask.end()); + return; + } + const auto *It = find(InVectors, V1); + if (It == InVectors.end()) { + if (InVectors.size() == 2 || + InVectors.front()->getType() != V1->getType() || + !isa(V1->getType())) { + Value *V = InVectors.front(); + if (InVectors.size() == 2) { + Cost += + createShuffle(InVectors.front(), InVectors.back(), CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } else if (cast(V->getType())->getNumElements() != + CommonMask.size()) { + Cost += createShuffle(InVectors.front(), nullptr, CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] == UndefMaskElem && Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = + V->getType() != V1->getType() + ? 
Idx + Sz + : Mask[Idx] + cast(V1->getType()) + ->getNumElements(); + if (V->getType() != V1->getType()) + Cost += createShuffle(V1, nullptr, Mask); + InVectors.front() = V; + if (InVectors.size() == 2) + InVectors.back() = V1; + else + InVectors.push_back(V1); + return; } - InstructionCost GatherCost = 0; - int Limit = Mask.size() * 2; - if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(Mask)) { - // Perfect match in the graph, will reuse the previously vectorized - // node. Cost is 0. - LLVM_DEBUG( - dbgs() - << "SLP: perfect diamond match for gather bundle that starts with " - << *VL.front() << ".\n"); - if (NeedToShuffleReuses) - GatherCost = - TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - FinalVecTy, E->ReuseShuffleIndices); - } else { - LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() - << " entries for bundle that starts with " - << *VL.front() << ".\n"); - // Detected that instead of gather we can emit a shuffle of single/two - // previously vectorized nodes. Add the cost of the permutation rather - // than gather. - ::addMask(Mask, E->ReuseShuffleIndices); - GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask); - } - if (!all_of(GatheredScalars, UndefValue::classof)) - GatherCost += getGatherCost(GatheredScalars); - return GatherCost; - } - if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) { - // Check that gather of extractelements can be represented as just a - // shuffle of a single/two vectors the scalars are extracted from. - // Found the bunch of extractelement instructions that must be gathered - // into a vector and can be represented as a permutation elements in a - // single input vector or of 2 input vectors. - InstructionCost Cost = - computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI); - AdjustExtractsCost(Cost, ExtractMask); - if (NeedToShuffleReuses) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - FinalVecTy, E->ReuseShuffleIndices); - return Cost; + // Check if second vector is required if the used elements are already + // used from the first one. + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) { + InVectors.push_back(V1); + break; + } } - if (isSplat(VL)) { - // Found the broadcasting of the single scalar, calculate the cost as the - // broadcast. - assert(VecTy == FinalVecTy && - "No reused scalars expected for broadcast."); + int VF = CommonMask.size(); + if (auto *FTy = dyn_cast(V1->getType())) + VF = FTy->getNumElements(); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) + CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF); + } + /// Adds another one input vector and the mask for the shuffling. + void addOrdered(Value *V1, ArrayRef Order) { + SmallVector NewMask; + inversePermutation(Order, NewMask); + add(V1, NewMask); + } + Value *gather(ArrayRef VL, Value *Root = nullptr) { + auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); + auto BuildVectorCost = [&](ArrayRef VL, + Value *Root) -> InstructionCost { + InstructionCost GatherCost = 0; + SmallVector Gathers(VL.begin(), VL.end()); + BoUpSLP::ValueSet VectorizedLoads; + // Improve gather cost for gather of loads, if we can group some of the + // loads into vector loads. 
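
A standalone sketch of the slicing strategy implemented by the loop that follows, under the simplifying assumption that a slice is vectorizable when its load addresses are consecutive; sliceVectorizable is an invented stand-in for canVectorizeLoads, and the real code additionally tracks scatter-vectorizable slices and their separate cost.

    // Hypothetical sketch; sliceVectorizable is invented and simply checks for
    // consecutive addresses, whereas the real code calls canVectorizeLoads and
    // also distinguishes scatter-vectorizable slices.
    #include <cstdio>
    #include <vector>

    static bool sliceVectorizable(const std::vector<int> &Addrs, unsigned Begin,
                                  unsigned VF) {
      for (unsigned I = 1; I < VF; ++I)
        if (Addrs[Begin + I] != Addrs[Begin + I - 1] + 1)
          return false;
      return true;
    }

    int main() {
      std::vector<int> Addrs = {0, 1, 2, 3, 40, 7, 80, 90}; // gathered load addresses
      unsigned N = Addrs.size();
      std::vector<bool> Vectorized(N, false);
      unsigned StartIdx = 0;
      // Try progressively smaller power-of-two slice sizes; slices that form a
      // vector load are costed as one wide load instead of N scalar gathers.
      for (unsigned VF = N / 2; VF >= 2; VF /= 2) {
        for (unsigned Cnt = StartIdx; Cnt + VF <= N; Cnt += VF) {
          if (Vectorized[Cnt])
            continue;
          if (sliceVectorizable(Addrs, Cnt, VF)) {
            for (unsigned I = 0; I < VF; ++I)
              Vectorized[Cnt + I] = true;
            if (Cnt == StartIdx)
              StartIdx += VF; // no need to retry an already-vectorized prefix
          }
        }
        if (StartIdx >= N)
          break;
      }
      for (unsigned I = 0; I < N; ++I)
        std::printf("%u: %s\n", I, Vectorized[I] ? "vector load" : "gather");
      return 0;
    }
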
+ InstructionsState S = getSameOpcode(VL, *R.TLI); + if (VL.size() > 2 && S.getOpcode() == Instruction::Load && + !S.isAltShuffle() && + !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && + !isSplat(Gathers)) { + unsigned StartIdx = 0; + unsigned VF = VL.size() / 2; + unsigned VectorizedCnt = 0; + unsigned ScatterVectorizeCnt = 0; + const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType()); + for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { + for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; + Cnt += VF) { + ArrayRef Slice = VL.slice(Cnt, VF); + if (!VectorizedLoads.count(Slice.front()) && + !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { + SmallVector PointerOps; + OrdersType CurrentOrder; + LoadsState LS = + canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE, + *R.LI, *R.TLI, CurrentOrder, PointerOps); + switch (LS) { + case LoadsState::Vectorize: + case LoadsState::ScatterVectorize: + // Mark the vectorized loads so that we don't vectorize them + // again. + if (LS == LoadsState::Vectorize) + ++VectorizedCnt; + else + ++ScatterVectorizeCnt; + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize + // it again. + if (Cnt == StartIdx) + StartIdx += VF; + break; + case LoadsState::Gather: + break; + } + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= VL.size()) + break; + // Found vectorizable parts - exit. + if (!VectorizedLoads.empty()) + break; + } + if (!VectorizedLoads.empty()) { + // Get the cost for gathered loads. + for (unsigned I = 0, End = VL.size(); I < End; I += VF) { + if (!VectorizedLoads.contains(VL[I])) + continue; + // Exclude potentially vectorized loads from list of gathered + // scalars. + for (unsigned K = I, End = I + VF; K < End; ++K) + Gathers[K] = PoisonValue::get(Gathers[K]->getType()); + } + // The cost for vectorized loads. + InstructionCost ScalarsCost = 0; + for (Value *V : VectorizedLoads) { + auto *LI = cast(V); + ScalarsCost += TTI.getMemoryOpCost( + Instruction::Load, LI->getType(), LI->getAlign(), + LI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo(), + LI); + } + auto *LI = cast(S.MainOp); + auto *LoadTy = FixedVectorType::get(LI->getType(), VF); + Align Alignment = LI->getAlign(); + GatherCost += + VectorizedCnt * + TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo(), LI); + GatherCost += ScatterVectorizeCnt * + TTI.getGatherScatterOpCost( + Instruction::Load, LoadTy, LI->getPointerOperand(), + /*VariableMask=*/false, Alignment, CostKind, LI); + // Add the cost for the subvectors shuffling. + GatherCost += (VectorizedCnt + ScatterVectorizeCnt - 1) * + TTI.getShuffleCost(TTI::SK_Select, VecTy); + GatherCost -= ScalarsCost; + } + } else if (!Root && !allConstant(VL) && isSplat(VL)) { + // Found the broadcasting of the single scalar, calculate the cost as + // the broadcast. const auto *It = find_if(VL, [](Value *V) { return !isa(V); }); - // If all values are undefs - consider cost free. - if (It == VL.end()) - return TTI::TCC_Free; + assert(It != VL.end() && "Expected at least one non-undef value."); // Add broadcast for non-identity shuffle only. 
bool NeedShuffle = count(VL, *It) > 1 && (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof)); - InstructionCost InsertCost = TTI->getVectorInstrCost( + InstructionCost InsertCost = TTI.getVectorInstrCost( Instruction::InsertElement, VecTy, CostKind, NeedShuffle ? 0 : std::distance(VL.begin(), It), PoisonValue::get(VecTy), *It); - return InsertCost + (NeedShuffle - ? TTI->getShuffleCost( - TargetTransformInfo::SK_Broadcast, VecTy, - /*Mask=*/std::nullopt, CostKind, - /*Index=*/0, - /*SubTp=*/nullptr, /*Args=*/*It) - : TTI::TCC_Free); - } - InstructionCost ReuseShuffleCost = 0; - if (NeedToShuffleReuses) - ReuseShuffleCost = TTI->getShuffleCost( - TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); - // Improve gather cost for gather of loads, if we can group some of the - // loads into vector loads. - if (VL.size() > 2 && E->getOpcode() == Instruction::Load && - !E->isAltShuffle()) { - BoUpSLP::ValueSet VectorizedLoads; - unsigned StartIdx = 0; - unsigned VF = VL.size() / 2; - unsigned VectorizedCnt = 0; - unsigned ScatterVectorizeCnt = 0; - const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType()); - for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { - for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; - Cnt += VF) { - ArrayRef Slice = VL.slice(Cnt, VF); - if (!VectorizedLoads.count(Slice.front()) && - !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { - SmallVector PointerOps; - OrdersType CurrentOrder; - LoadsState LS = - canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI, - *TLI, CurrentOrder, PointerOps); - switch (LS) { - case LoadsState::Vectorize: - case LoadsState::ScatterVectorize: - // Mark the vectorized loads so that we don't vectorize them - // again. - if (LS == LoadsState::Vectorize) - ++VectorizedCnt; - else - ++ScatterVectorizeCnt; - VectorizedLoads.insert(Slice.begin(), Slice.end()); - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += VF; - break; - case LoadsState::Gather: - break; - } - } + return InsertCost + + (NeedShuffle ? TTI.getShuffleCost( + TargetTransformInfo::SK_Broadcast, VecTy, + /*Mask=*/std::nullopt, CostKind, + /*Index=*/0, /*SubTp=*/nullptr, /*Args=*/*It) + : TTI::TCC_Free); + } + return GatherCost + R.getGatherCost(Gathers); + }; + Cost += BuildVectorCost(VL, Root); + if (!Root) { + SmallVector Vals; + for (Value *V : VL) { + if (isa(V)) { + Vals.push_back(cast(V)); + continue; } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= VL.size()) - break; - // Found vectorizable parts - exit. - if (!VectorizedLoads.empty()) - break; + Vals.push_back(Constant::getNullValue(V->getType())); } - if (!VectorizedLoads.empty()) { - InstructionCost GatherCost = 0; - unsigned NumParts = TTI->getNumberOfParts(VecTy); - bool NeedInsertSubvectorAnalysis = - !NumParts || (VL.size() / VF) > NumParts; - // Get the cost for gathered loads. - for (unsigned I = 0, End = VL.size(); I < End; I += VF) { - if (VectorizedLoads.contains(VL[I])) + return ConstantVector::get(Vals); + } + return ConstantVector::getSplat( + ElementCount::getFixed(VL.size()), + Constant::getNullValue(VL.front()->getType())); + } + InstructionCost createFreeze(InstructionCost Cost) { return Cost; } + /// Finalize emission of the shuffles. 
+ InstructionCost + finalize(ArrayRef ExtMask, unsigned VF = 0, + function_ref &)> Action = {}) { + IsFinalized = true; + if (Action) { + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Cost += createShuffle(Vec, InVectors.back(), CommonMask); + InVectors.pop_back(); + } else { + Cost += createShuffle(Vec, nullptr, CommonMask); + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + assert(VF > 0 && + "Expected vector length for the final value before action."); + Action(Vec, CommonMask); + InVectors.front() = Vec; + } + if (!ExtMask.empty()) { + if (CommonMask.empty()) { + CommonMask.assign(ExtMask.begin(), ExtMask.end()); + } else { + SmallVector NewMask(ExtMask.size(), UndefMaskElem); + for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { + if (ExtMask[I] == UndefMaskElem) continue; - GatherCost += getGatherCost(VL.slice(I, VF)); - } - // The cost for vectorized loads. - InstructionCost ScalarsCost = 0; - for (Value *V : VectorizedLoads) { - auto *LI = cast(V); - ScalarsCost += - TTI->getMemoryOpCost(Instruction::Load, LI->getType(), - LI->getAlign(), LI->getPointerAddressSpace(), - CostKind, TTI::OperandValueInfo(), LI); - } - auto *LI = cast(E->getMainOp()); - auto *LoadTy = FixedVectorType::get(LI->getType(), VF); - Align Alignment = LI->getAlign(); - GatherCost += - VectorizedCnt * - TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo(), LI); - GatherCost += ScatterVectorizeCnt * - TTI->getGatherScatterOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); - if (NeedInsertSubvectorAnalysis) { - // Add the cost for the subvectors insert. - for (int I = VF, E = VL.size(); I < E; I += VF) - GatherCost += - TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, - std::nullopt, CostKind, I, LoadTy); - } - return ReuseShuffleCost + GatherCost - ScalarsCost; - } - } - return ReuseShuffleCost + getGatherCost(VL); + NewMask[I] = CommonMask[ExtMask[I]]; + } + CommonMask.swap(NewMask); + } + } + if (CommonMask.empty()) { + assert(InVectors.size() == 1 && "Expected only one vector with no mask"); + return Cost; + } + if (InVectors.size() == 2) + return Cost + + createShuffle(InVectors.front(), InVectors.back(), CommonMask); + return Cost + createShuffle(InVectors.front(), nullptr, CommonMask); + } + + ~ShuffleCostEstimator() { + assert((IsFinalized || CommonMask.empty()) && + "Shuffle construction must be finalized."); + } +}; + +InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, + ArrayRef VectorizedVals) { + ArrayRef VL = E->Scalars; + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + else if (CmpInst *CI = dyn_cast(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *IE = dyn_cast(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + // If we have computed a smaller type for the expression, update VecTy so + // that the costs will be accurate. 
+ if (MinBWs.count(VL[0])) + VecTy = FixedVectorType::get( + IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + unsigned EntryVF = E->getVectorFactor(); + auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + + bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); + if (E->State == TreeEntry::NeedToGather) { + if (allConstant(VL)) + return 0; + if (isa(VL[0])) + return InstructionCost::getInvalid(); + return processBuildVector( + E, *TTI, VectorizedVals, *this); } InstructionCost CommonCost = 0; SmallVector Mask; @@ -9097,6 +9181,10 @@ } return Vec; } + Value *createIdentity(Value *V) { return V; } + Value *createPoison(Type *Ty, unsigned VF) { + return PoisonValue::get(FixedVectorType::get(Ty, VF)); + } /// Resizes 2 input vector to match the sizes, if the they are not equal /// yet. The smallest vector is resized to the size of the larger vector. void resizeToMatch(Value *&V1, Value *&V2) { @@ -9129,7 +9217,8 @@ assert(V1 && "Expected at least one vector value."); ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq, R.CSEBlocks); - return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder); + return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, + ShuffleBuilder); } /// Transforms mask \p CommonMask per given \p Mask to make proper set after @@ -9163,13 +9252,14 @@ R.eraseInstruction(EI); } return VecBase; - }; + } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. - Value *needToDelay(const TreeEntry *E, ArrayRef Deps) { + std::optional needToDelay(const TreeEntry *E, + ArrayRef Deps) const { // No need to delay emission if all deps are ready. if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) - return nullptr; + return std::nullopt; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(), @@ -9177,7 +9267,13 @@ Value *Vec = Builder.CreateAlignedLoad( VecTy, PoisonValue::get(VecTy->getPointerTo()), MaybeAlign()); return Vec; - }; + } + void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { + add(E1->VectorizedValue, E2->VectorizedValue, Mask); + } + void add(const TreeEntry *E1, ArrayRef Mask) { + add(E1->VectorizedValue, Mask); + } /// Adds 2 input vectors and the mask for their shuffling. void add(Value *V1, Value *V2, ArrayRef Mask) { assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); @@ -9269,6 +9365,10 @@ inversePermutation(Order, NewMask); add(V1, NewMask); } + Value *gather(ArrayRef VL, Value *Root = nullptr) { + return R.gather(VL, Root); + } + Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } /// Finalize emission of the shuffles. /// \param Action the action (if any) to be performed before final applying of /// the \p ExtMask mask. 
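
The next hunk templatizes createBuildVector into processBuildVector so that the same gather/shuffle logic drives both cost estimation (ShuffleCostEstimator above, returning InstructionCost) and IR emission (ShuffleInstructionBuilder, returning Value *). A much-simplified, self-contained C++ sketch of that pattern, with invented names (CostBuilder, EmitBuilder, processBuildVectorSketch), not the BoUpSLP APIs:

    // Hypothetical illustration; CostBuilder, EmitBuilder and
    // processBuildVectorSketch are invented names, not BoUpSLP APIs.
    #include <cstdio>
    #include <string>
    #include <vector>

    struct CostBuilder {
      using ResTy = int;
      // Charge one insert per gathered scalar.
      int gather(const std::vector<int> &Scalars) { return (int)Scalars.size(); }
      int finalize(int GatherCost) { return GatherCost; }
    };

    struct EmitBuilder {
      using ResTy = std::string;
      // "Emit" a textual build-vector instead of real IR.
      std::string gather(const std::vector<int> &Scalars) {
        std::string S = "buildvector(";
        for (int V : Scalars)
          S += std::to_string(V) + " ";
        return S + ")";
      }
      std::string finalize(std::string V) { return V; }
    };

    // One algorithm, two result types: the builder decides whether the walk
    // produces a cost or an emitted value.
    template <typename BuilderTy, typename ResTy = typename BuilderTy::ResTy>
    ResTy processBuildVectorSketch(const std::vector<int> &Scalars,
                                   BuilderTy &Builder) {
      ResTy Res = Builder.gather(Scalars);
      return Builder.finalize(Res);
    }

    int main() {
      std::vector<int> Scalars = {1, 2, 3, 4};
      CostBuilder CB;
      EmitBuilder EB;
      std::printf("cost = %d\n", processBuildVectorSketch(Scalars, CB));
      std::printf("ir   = %s\n", processBuildVectorSketch(Scalars, EB).c_str());
      return 0;
    }
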
@@ -9437,7 +9537,8 @@ return vectorizeTree(I->get()); } -Value *BoUpSLP::createBuildVector(const TreeEntry *E) { +template +ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); unsigned VF = E->getVectorFactor(); @@ -9478,8 +9579,8 @@ std::fill(Mask.begin(), Mask.end(), I); return true; }; - ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); - Value *Vec = nullptr; + BVTy ShuffleBuilder(Params...); + ResTy Res = ResTy(); SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; @@ -9509,12 +9610,13 @@ GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); } if (GatherShuffle) { - if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { + if (std::optional Delayed = + ShuffleBuilder.needToDelay(E, Entries)) { // Delay emission of gathers which are not ready yet. PostponedGathers.insert(E); // Postpone gather emission, will be emitted after the end of the // process to keep correct order. - return Delayed; + return *Delayed; } assert((Entries.size() == 1 || Entries.size() == 2) && "Expected shuffle of 1 or 2 entries."); @@ -9664,15 +9766,16 @@ if (GatherShuffle) { if (Entries.size() == 1) { IsUsedInExpr = FindReusedSplat(Mask); - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); + ShuffleBuilder.add(Entries.front(), Mask); + if (Entries.front()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); } else { - ShuffleBuilder.add(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && - isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); + ShuffleBuilder.add(Entries.front(), Entries.back(), Mask); + if (Entries.front()->VectorizedValue && Entries.back()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && + isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); } } // Try to figure out best way to combine values: build a shuffle and insert @@ -9722,7 +9825,7 @@ if (!all_of(GatheredScalars, PoisonValue::classof)) { SmallVector BVMask(GatheredScalars.size(), UndefMaskElem); TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true); - Value *BV = gather(GatheredScalars); + Value *BV = ShuffleBuilder.gather(GatheredScalars); ShuffleBuilder.add(BV, BVMask); } if (all_of(NonConstants, [=](Value *V) { @@ -9730,21 +9833,21 @@ (IsSingleShuffle && ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) && isa(V)); })) - Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); else - Vec = ShuffleBuilder.finalize( + Res = ShuffleBuilder.finalize( E->ReuseShuffleIndices, E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl &Mask) { TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); - Vec = gather(NonConstants, Vec); + Vec = ShuffleBuilder.gather(NonConstants, Vec); }); } else if (!allConstant(GatheredScalars)) { // Gather unique scalars and all constants. 
SmallVector ReuseMask(GatheredScalars.size(), UndefMaskElem); TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); - Vec = gather(GatheredScalars); - ShuffleBuilder.add(Vec, ReuseMask); - Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Value *BV = ShuffleBuilder.gather(GatheredScalars); + ShuffleBuilder.add(BV, ReuseMask); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } else { // Gather all constants. SmallVector Mask(E->Scalars.size(), UndefMaskElem); @@ -9752,14 +9855,19 @@ if (!isa(V)) Mask[I] = I; } - Vec = gather(E->Scalars); - ShuffleBuilder.add(Vec, Mask); - Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Value *BV = ShuffleBuilder.gather(E->Scalars); + ShuffleBuilder.add(BV, Mask); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } if (NeedFreeze) - Vec = Builder.CreateFreeze(Vec); - return Vec; + Res = ShuffleBuilder.createFreeze(Res); + return Res; +} + +Value *BoUpSLP::createBuildVector(const TreeEntry *E) { + return processBuildVector(E, Builder, + *this); } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll --- a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll @@ -23,10 +23,11 @@ ;; the vector store that replaces them. ; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32), metadata ![[ID:[0-9]+]], metadata ptr %arrayidx, metadata !DIExpression()) +; CHECK: store <2 x float> {{.*}} !DIAssignID ![[ID]] ; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 32, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 4)) -; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 8)) -; CHECK: store <4 x float> {{.*}} !DIAssignID ![[ID]] -; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 96, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 12)) +; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 32), metadata ![[ID1:[0-9]+]], metadata ptr %arrayidx7, metadata !DIExpression()) +; CHECK: store <2 x float> {{.*}} !DIAssignID ![[ID1]] +; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 96, 32), metadata ![[ID1]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 12)) target triple = "x86_64-unknown-unknown" diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -3,27 +3,21 @@ define void @test(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1:%.*]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = 
trunc i64 [[TMP5]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP14]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; CHECK-NEXT: br label [[TMP17:%.*]] -; CHECK: 17: -; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP22:%.*]], [[TMP17]] ], [ [[TMP6]], [[TMP3:%.*]] ] -; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP9]], [[TMP3]] ] -; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP13]], [[TMP3]] ] -; CHECK-NEXT: [[TMP21:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP16]], [[TMP3]] ] -; CHECK-NEXT: [[TMP22]] = or i32 [[TMP18]], 0 -; CHECK-NEXT: br label [[TMP17]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1:%.*]], <2 x i64> [[TMP0:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> , <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; CHECK-NEXT: br label [[TMP11:%.*]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[TMP11]] ], [ [[TMP10]], [[TMP3:%.*]] ] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> zeroinitializer, [[TMP13]] +; CHECK-NEXT: [[TMP16]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> +; CHECK-NEXT: br label [[TMP11]] ; %4 = extractelement <2 x i64> %1, i64 0 %5 = or i64 %4, 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -343,15 +343,16 @@ ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[Y]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x 
i16> [[TMP10]]) -; CHECK-NEXT: ret i16 [[TMP11]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP2]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[TMP6]], [[TMP7]] +; CHECK-NEXT: ret i16 [[OP_RDX]] ; entry: %0 = load i16, ptr %x, align 2 @@ -420,29 +421,34 @@ ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP3]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = mul nuw nsw <16 x i32> [[TMP17]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP28]]) -; CHECK-NEXT: ret i32 [[TMP29]] +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw <4 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> +; CHECK-NEXT: 
[[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP11]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP19]]) +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], [[TMP23]] +; CHECK-NEXT: ret i32 [[OP_RDX2]] ; entry: %idx.ext = sext i32 %off1 to i64 @@ -708,29 +714,29 @@ ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i32, ptr [[Z:%.*]], i64 1 ; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 6 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[X]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr [[Y]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = mul nsw <2 x i32> [[TMP11]], [[TMP7]] -; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP13]], [[TMP9]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> ; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 7 ; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]] ; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 11 -; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 ; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4 -; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], ptr [[ARRAYIDX72]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4 ; CHECK-NEXT: store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4 ; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = 
mul nsw <2 x i32> [[TMP22]], [[TMP18]] -; CHECK-NEXT: [[TMP26:%.*]] = mul nsw <2 x i32> [[TMP24]], [[TMP20]] -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> [[TMP26]], <4 x i32> -; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], ptr [[ARRAYIDX84]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP18]], <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -833,14 +839,17 @@ ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[Y]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> -; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], ptr [[DST0:%.*]], align 2 +; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i16, ptr [[DST0:%.*]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i16> [[TMP5]], [[TMP4]] +; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr [[DST0]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <4 x i32> +; CHECK-NEXT: store <4 x i16> [[TMP7]], ptr [[DST4]], align 2 ; CHECK-NEXT: ret void ; entry: @@ -921,30 +930,30 @@ ; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i32, ptr [[DST0:%.*]], i64 4 ; CHECK-NEXT: [[DST8:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 8 ; CHECK-NEXT: [[DST12:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 12 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP2]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to 
<4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw <4 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP10]], [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP11]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 ; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = mul nuw nsw <4 x i32> [[TMP18]], [[TMP21]] -; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[TMP25]] to <4 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = zext <4 x i8> [[TMP28]] to <4 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = mul nuw nsw <4 x i32> [[TMP26]], [[TMP29]] -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[DST0]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[DST4]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[DST8]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP30]], ptr [[DST12]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[DST0]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[DST4]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[DST8]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr [[DST12]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -1198,88 +1207,421 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST1:%.*]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P1:%.*]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[P2:%.*]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV2]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[CONV4:%.*]] = zext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[CONV6:%.*]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[SUB7:%.*]] = sub nsw i32 [[CONV4]], [[CONV6]] +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i32 [[SUB7]], 16 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SHL]], [[SUB]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1 +; 
CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[CONV11:%.*]] = zext i8 [[TMP5]] to i32 +; CHECK-NEXT: [[SUB12:%.*]] = sub nsw i32 [[CONV9]], [[CONV11]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 5 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX13]], align 1 +; CHECK-NEXT: [[CONV14:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 5 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX15]], align 1 +; CHECK-NEXT: [[CONV16:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[SUB17:%.*]] = sub nsw i32 [[CONV14]], [[CONV16]] +; CHECK-NEXT: [[SHL18:%.*]] = shl nsw i32 [[SUB17]], 16 +; CHECK-NEXT: [[ADD19:%.*]] = add nsw i32 [[SHL18]], [[SUB12]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX20]], align 1 +; CHECK-NEXT: [[CONV21:%.*]] = zext i8 [[TMP8]] to i32 +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX22]], align 1 +; CHECK-NEXT: [[CONV23:%.*]] = zext i8 [[TMP9]] to i32 +; CHECK-NEXT: [[SUB24:%.*]] = sub nsw i32 [[CONV21]], [[CONV23]] +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 6 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX25]], align 1 +; CHECK-NEXT: [[CONV26:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 6 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX27]], align 1 +; CHECK-NEXT: [[CONV28:%.*]] = zext i8 [[TMP11]] to i32 +; CHECK-NEXT: [[SUB29:%.*]] = sub nsw i32 [[CONV26]], [[CONV28]] +; CHECK-NEXT: [[SHL30:%.*]] = shl nsw i32 [[SUB29]], 16 +; CHECK-NEXT: [[ADD31:%.*]] = add nsw i32 [[SHL30]], [[SUB24]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 +; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP12]] to i32 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX34]], align 1 +; CHECK-NEXT: [[CONV35:%.*]] = zext i8 [[TMP13]] to i32 +; CHECK-NEXT: [[SUB36:%.*]] = sub nsw i32 [[CONV33]], [[CONV35]] +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 7 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX37]], align 1 +; CHECK-NEXT: [[CONV38:%.*]] = zext i8 [[TMP14]] to i32 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 7 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX39]], align 1 +; CHECK-NEXT: [[CONV40:%.*]] = zext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[SUB41:%.*]] = sub nsw i32 [[CONV38]], [[CONV40]] +; CHECK-NEXT: [[SHL42:%.*]] = shl nsw i32 [[SUB41]], 16 +; CHECK-NEXT: [[ADD43:%.*]] = add nsw i32 [[SHL42]], [[SUB36]] +; CHECK-NEXT: [[ADD44:%.*]] = add nsw i32 [[ADD19]], [[ADD]] +; CHECK-NEXT: [[SUB45:%.*]] = sub nsw i32 [[ADD]], [[ADD19]] +; CHECK-NEXT: [[ADD46:%.*]] = add nsw i32 [[ADD43]], [[ADD31]] +; CHECK-NEXT: [[SUB47:%.*]] = sub nsw i32 [[ADD31]], [[ADD43]] +; CHECK-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD46]], [[ADD44]] +; CHECK-NEXT: [[SUB51:%.*]] = sub nsw i32 [[ADD44]], [[ADD46]] +; CHECK-NEXT: [[ADD55:%.*]] = add nsw i32 [[SUB47]], [[SUB45]] +; CHECK-NEXT: [[SUB59:%.*]] = sub nsw i32 
[[SUB45]], [[SUB47]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP16]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[CONV2_1:%.*]] = zext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[SUB_1:%.*]] = sub nsw i32 [[CONV_1]], [[CONV2_1]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[CONV4_1:%.*]] = zext i8 [[TMP18]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[CONV6_1:%.*]] = zext i8 [[TMP19]] to i32 +; CHECK-NEXT: [[SUB7_1:%.*]] = sub nsw i32 [[CONV4_1]], [[CONV6_1]] +; CHECK-NEXT: [[SHL_1:%.*]] = shl nsw i32 [[SUB7_1]], 16 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[SHL_1]], [[SUB_1]] +; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX8_1]], align 1 +; CHECK-NEXT: [[CONV9_1:%.*]] = zext i8 [[TMP20]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 1 +; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX10_1]], align 1 +; CHECK-NEXT: [[CONV11_1:%.*]] = zext i8 [[TMP21]] to i32 +; CHECK-NEXT: [[SUB12_1:%.*]] = sub nsw i32 [[CONV9_1]], [[CONV11_1]] +; CHECK-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 5 +; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX13_1]], align 1 +; CHECK-NEXT: [[CONV14_1:%.*]] = zext i8 [[TMP22]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 5 +; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX15_1]], align 1 +; CHECK-NEXT: [[CONV16_1:%.*]] = zext i8 [[TMP23]] to i32 +; CHECK-NEXT: [[SUB17_1:%.*]] = sub nsw i32 [[CONV14_1]], [[CONV16_1]] +; CHECK-NEXT: [[SHL18_1:%.*]] = shl nsw i32 [[SUB17_1]], 16 +; CHECK-NEXT: [[ADD19_1:%.*]] = add nsw i32 [[SHL18_1]], [[SUB12_1]] +; CHECK-NEXT: [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 2 +; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX20_1]], align 1 +; CHECK-NEXT: [[CONV21_1:%.*]] = zext i8 [[TMP24]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 2 +; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1 +; CHECK-NEXT: [[CONV23_1:%.*]] = zext i8 [[TMP25]] to i32 +; CHECK-NEXT: [[SUB24_1:%.*]] = sub nsw i32 [[CONV21_1]], [[CONV23_1]] +; CHECK-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 6 +; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX25_1]], align 1 +; CHECK-NEXT: [[CONV26_1:%.*]] = zext i8 [[TMP26]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 6 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1 +; CHECK-NEXT: [[CONV28_1:%.*]] = zext i8 [[TMP27]] to i32 +; CHECK-NEXT: [[SUB29_1:%.*]] = sub nsw i32 [[CONV26_1]], [[CONV28_1]] +; CHECK-NEXT: [[SHL30_1:%.*]] = shl nsw i32 [[SUB29_1]], 16 +; CHECK-NEXT: [[ADD31_1:%.*]] = add nsw i32 [[SHL30_1]], [[SUB24_1]] +; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 3 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr 
[[ARRAYIDX32_1]], align 1 +; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP28]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 3 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX34_1]], align 1 +; CHECK-NEXT: [[CONV35_1:%.*]] = zext i8 [[TMP29]] to i32 +; CHECK-NEXT: [[SUB36_1:%.*]] = sub nsw i32 [[CONV33_1]], [[CONV35_1]] +; CHECK-NEXT: [[ARRAYIDX37_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 7 +; CHECK-NEXT: [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX37_1]], align 1 +; CHECK-NEXT: [[CONV38_1:%.*]] = zext i8 [[TMP30]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 7 +; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX39_1]], align 1 +; CHECK-NEXT: [[CONV40_1:%.*]] = zext i8 [[TMP31]] to i32 +; CHECK-NEXT: [[SUB41_1:%.*]] = sub nsw i32 [[CONV38_1]], [[CONV40_1]] +; CHECK-NEXT: [[SHL42_1:%.*]] = shl nsw i32 [[SUB41_1]], 16 +; CHECK-NEXT: [[ADD43_1:%.*]] = add nsw i32 [[SHL42_1]], [[SUB36_1]] +; CHECK-NEXT: [[ADD44_1:%.*]] = add nsw i32 [[ADD19_1]], [[ADD_1]] +; CHECK-NEXT: [[SUB45_1:%.*]] = sub nsw i32 [[ADD_1]], [[ADD19_1]] +; CHECK-NEXT: [[ADD46_1:%.*]] = add nsw i32 [[ADD43_1]], [[ADD31_1]] +; CHECK-NEXT: [[SUB47_1:%.*]] = sub nsw i32 [[ADD31_1]], [[ADD43_1]] +; CHECK-NEXT: [[ADD48_1:%.*]] = add nsw i32 [[ADD46_1]], [[ADD44_1]] +; CHECK-NEXT: [[SUB51_1:%.*]] = sub nsw i32 [[ADD44_1]], [[ADD46_1]] +; CHECK-NEXT: [[ADD55_1:%.*]] = add nsw i32 [[SUB47_1]], [[SUB45_1]] +; CHECK-NEXT: [[SUB59_1:%.*]] = sub nsw i32 [[SUB45_1]], [[SUB47_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32 +; CHECK-NEXT: [[SUB_2:%.*]] = sub nsw i32 [[CONV_2]], [[CONV2_2]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 +; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32 +; CHECK-NEXT: [[SUB7_2:%.*]] = sub nsw i32 [[CONV4_2]], [[CONV6_2]] +; CHECK-NEXT: [[SHL_2:%.*]] = shl nsw i32 [[SUB7_2]], 16 +; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[SHL_2]], [[SUB_2]] +; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 +; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 1 +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1 +; CHECK-NEXT: [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32 +; CHECK-NEXT: [[SUB12_2:%.*]] = sub nsw i32 [[CONV9_2]], [[CONV11_2]] +; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 5 +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1 +; CHECK-NEXT: [[CONV14_2:%.*]] = zext i8 [[TMP38]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 5 +; CHECK-NEXT: [[TMP39:%.*]] 
= load i8, ptr [[ARRAYIDX15_2]], align 1 +; CHECK-NEXT: [[CONV16_2:%.*]] = zext i8 [[TMP39]] to i32 +; CHECK-NEXT: [[SUB17_2:%.*]] = sub nsw i32 [[CONV14_2]], [[CONV16_2]] +; CHECK-NEXT: [[SHL18_2:%.*]] = shl nsw i32 [[SUB17_2]], 16 +; CHECK-NEXT: [[ADD19_2:%.*]] = add nsw i32 [[SHL18_2]], [[SUB12_2]] +; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 2 +; CHECK-NEXT: [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1 +; CHECK-NEXT: [[CONV21_2:%.*]] = zext i8 [[TMP40]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 2 +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1 +; CHECK-NEXT: [[CONV23_2:%.*]] = zext i8 [[TMP41]] to i32 +; CHECK-NEXT: [[SUB24_2:%.*]] = sub nsw i32 [[CONV21_2]], [[CONV23_2]] +; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 6 +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1 +; CHECK-NEXT: [[CONV26_2:%.*]] = zext i8 [[TMP42]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 6 +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1 +; CHECK-NEXT: [[CONV28_2:%.*]] = zext i8 [[TMP43]] to i32 +; CHECK-NEXT: [[SUB29_2:%.*]] = sub nsw i32 [[CONV26_2]], [[CONV28_2]] +; CHECK-NEXT: [[SHL30_2:%.*]] = shl nsw i32 [[SUB29_2]], 16 +; CHECK-NEXT: [[ADD31_2:%.*]] = add nsw i32 [[SHL30_2]], [[SUB24_2]] +; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 3 +; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1 +; CHECK-NEXT: [[CONV33_2:%.*]] = zext i8 [[TMP44]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 3 +; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1 +; CHECK-NEXT: [[CONV35_2:%.*]] = zext i8 [[TMP45]] to i32 +; CHECK-NEXT: [[SUB36_2:%.*]] = sub nsw i32 [[CONV33_2]], [[CONV35_2]] +; CHECK-NEXT: [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 7 +; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1 +; CHECK-NEXT: [[CONV38_2:%.*]] = zext i8 [[TMP46]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 7 +; CHECK-NEXT: [[TMP47:%.*]] = load i8, ptr [[ARRAYIDX39_2]], align 1 +; CHECK-NEXT: [[CONV40_2:%.*]] = zext i8 [[TMP47]] to i32 +; CHECK-NEXT: [[SUB41_2:%.*]] = sub nsw i32 [[CONV38_2]], [[CONV40_2]] +; CHECK-NEXT: [[SHL42_2:%.*]] = shl nsw i32 [[SUB41_2]], 16 +; CHECK-NEXT: [[ADD43_2:%.*]] = add nsw i32 [[SHL42_2]], [[SUB36_2]] +; CHECK-NEXT: [[ADD44_2:%.*]] = add nsw i32 [[ADD19_2]], [[ADD_2]] +; CHECK-NEXT: [[SUB45_2:%.*]] = sub nsw i32 [[ADD_2]], [[ADD19_2]] +; CHECK-NEXT: [[ADD46_2:%.*]] = add nsw i32 [[ADD43_2]], [[ADD31_2]] +; CHECK-NEXT: [[SUB47_2:%.*]] = sub nsw i32 [[ADD31_2]], [[ADD43_2]] +; CHECK-NEXT: [[ADD48_2:%.*]] = add nsw i32 [[ADD46_2]], [[ADD44_2]] +; CHECK-NEXT: [[SUB51_2:%.*]] = sub nsw i32 [[ADD44_2]], [[ADD46_2]] +; CHECK-NEXT: [[ADD55_2:%.*]] = add nsw i32 [[SUB47_2]], [[SUB45_2]] +; CHECK-NEXT: [[SUB59_2:%.*]] = sub nsw i32 [[SUB45_2]], [[SUB47_2]] ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP48:%.*]] = load i8, ptr [[ADD_PTR_2]], align 1 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32 +; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[ADD_PTR64_2]], 
align 1 +; CHECK-NEXT: [[CONV2_3:%.*]] = zext i8 [[TMP49]] to i32 +; CHECK-NEXT: [[SUB_3:%.*]] = sub nsw i32 [[CONV_3]], [[CONV2_3]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[CONV4_3:%.*]] = zext i8 [[TMP50]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP25]], <4 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP33]], <4 x i8> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = sub nsw <16 x i32> [[TMP31]], [[TMP39]] -; CHECK-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP42]], <4 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP50]], <4 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x 
i8> [[TMP51]], <16 x i8> [[TMP52]], <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i8> [[TMP53]], <16 x i8> [[TMP54]], <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP48]], [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]], -; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP40]] -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]] -; CHECK-NEXT: [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]] -; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]] -; CHECK-NEXT: [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]] -; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP67]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = add nsw <16 x i32> [[TMP67]], [[TMP68]] -; CHECK-NEXT: [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP67]], [[TMP68]] -; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32> -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP71]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[TMP74:%.*]] = sub nsw <16 x i32> [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP73]], <16 x i32> [[TMP74]], <16 x i32> -; CHECK-NEXT: [[TMP76:%.*]] = lshr <16 x i32> [[TMP75]], -; CHECK-NEXT: [[TMP77:%.*]] = and <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = mul nuw <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = add <16 x i32> [[TMP78]], [[TMP75]] -; CHECK-NEXT: [[TMP80:%.*]] = xor <16 x i32> [[TMP79]], [[TMP78]] -; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP80]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP81]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP81]], 16 +; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[CONV6_3:%.*]] = zext i8 [[TMP51]] to i32 +; CHECK-NEXT: [[SUB7_3:%.*]] = sub nsw i32 [[CONV4_3]], [[CONV6_3]] +; CHECK-NEXT: [[SHL_3:%.*]] = shl nsw i32 [[SUB7_3]], 16 +; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[SHL_3]], [[SUB_3]] +; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 1 +; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1 +; CHECK-NEXT: [[CONV9_3:%.*]] = zext i8 [[TMP52]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 1 +; CHECK-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1 +; CHECK-NEXT: [[CONV11_3:%.*]] = zext i8 [[TMP53]] to i32 +; CHECK-NEXT: [[SUB12_3:%.*]] = sub nsw i32 [[CONV9_3]], [[CONV11_3]] +; CHECK-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 5 +; CHECK-NEXT: [[TMP54:%.*]] = load i8, ptr [[ARRAYIDX13_3]], align 1 +; CHECK-NEXT: [[CONV14_3:%.*]] = zext i8 [[TMP54]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 5 +; CHECK-NEXT: [[TMP55:%.*]] = load 
i8, ptr [[ARRAYIDX15_3]], align 1 +; CHECK-NEXT: [[CONV16_3:%.*]] = zext i8 [[TMP55]] to i32 +; CHECK-NEXT: [[SUB17_3:%.*]] = sub nsw i32 [[CONV14_3]], [[CONV16_3]] +; CHECK-NEXT: [[SHL18_3:%.*]] = shl nsw i32 [[SUB17_3]], 16 +; CHECK-NEXT: [[ADD19_3:%.*]] = add nsw i32 [[SHL18_3]], [[SUB12_3]] +; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 2 +; CHECK-NEXT: [[TMP56:%.*]] = load i8, ptr [[ARRAYIDX20_3]], align 1 +; CHECK-NEXT: [[CONV21_3:%.*]] = zext i8 [[TMP56]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 2 +; CHECK-NEXT: [[TMP57:%.*]] = load i8, ptr [[ARRAYIDX22_3]], align 1 +; CHECK-NEXT: [[CONV23_3:%.*]] = zext i8 [[TMP57]] to i32 +; CHECK-NEXT: [[SUB24_3:%.*]] = sub nsw i32 [[CONV21_3]], [[CONV23_3]] +; CHECK-NEXT: [[ARRAYIDX25_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 6 +; CHECK-NEXT: [[TMP58:%.*]] = load i8, ptr [[ARRAYIDX25_3]], align 1 +; CHECK-NEXT: [[CONV26_3:%.*]] = zext i8 [[TMP58]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 6 +; CHECK-NEXT: [[TMP59:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1 +; CHECK-NEXT: [[CONV28_3:%.*]] = zext i8 [[TMP59]] to i32 +; CHECK-NEXT: [[SUB29_3:%.*]] = sub nsw i32 [[CONV26_3]], [[CONV28_3]] +; CHECK-NEXT: [[SHL30_3:%.*]] = shl nsw i32 [[SUB29_3]], 16 +; CHECK-NEXT: [[ADD31_3:%.*]] = add nsw i32 [[SHL30_3]], [[SUB24_3]] +; CHECK-NEXT: [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 3 +; CHECK-NEXT: [[TMP60:%.*]] = load i8, ptr [[ARRAYIDX32_3]], align 1 +; CHECK-NEXT: [[CONV33_3:%.*]] = zext i8 [[TMP60]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 3 +; CHECK-NEXT: [[TMP61:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1 +; CHECK-NEXT: [[CONV35_3:%.*]] = zext i8 [[TMP61]] to i32 +; CHECK-NEXT: [[SUB36_3:%.*]] = sub nsw i32 [[CONV33_3]], [[CONV35_3]] +; CHECK-NEXT: [[ARRAYIDX37_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 7 +; CHECK-NEXT: [[TMP62:%.*]] = load i8, ptr [[ARRAYIDX37_3]], align 1 +; CHECK-NEXT: [[CONV38_3:%.*]] = zext i8 [[TMP62]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 7 +; CHECK-NEXT: [[TMP63:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1 +; CHECK-NEXT: [[CONV40_3:%.*]] = zext i8 [[TMP63]] to i32 +; CHECK-NEXT: [[SUB41_3:%.*]] = sub nsw i32 [[CONV38_3]], [[CONV40_3]] +; CHECK-NEXT: [[SHL42_3:%.*]] = shl nsw i32 [[SUB41_3]], 16 +; CHECK-NEXT: [[ADD43_3:%.*]] = add nsw i32 [[SHL42_3]], [[SUB36_3]] +; CHECK-NEXT: [[ADD44_3:%.*]] = add nsw i32 [[ADD19_3]], [[ADD_3]] +; CHECK-NEXT: [[SUB45_3:%.*]] = sub nsw i32 [[ADD_3]], [[ADD19_3]] +; CHECK-NEXT: [[ADD46_3:%.*]] = add nsw i32 [[ADD43_3]], [[ADD31_3]] +; CHECK-NEXT: [[SUB47_3:%.*]] = sub nsw i32 [[ADD31_3]], [[ADD43_3]] +; CHECK-NEXT: [[ADD48_3:%.*]] = add nsw i32 [[ADD46_3]], [[ADD44_3]] +; CHECK-NEXT: [[SUB51_3:%.*]] = sub nsw i32 [[ADD44_3]], [[ADD46_3]] +; CHECK-NEXT: [[ADD55_3:%.*]] = add nsw i32 [[SUB47_3]], [[SUB45_3]] +; CHECK-NEXT: [[SUB59_3:%.*]] = sub nsw i32 [[SUB45_3]], [[SUB47_3]] +; CHECK-NEXT: [[ADD78:%.*]] = add nsw i32 [[ADD48_1]], [[ADD48]] +; CHECK-NEXT: [[SUB86:%.*]] = sub nsw i32 [[ADD48]], [[ADD48_1]] +; CHECK-NEXT: [[ADD94:%.*]] = add nsw i32 [[ADD48_3]], [[ADD48_2]] +; CHECK-NEXT: [[SUB102:%.*]] = sub nsw i32 [[ADD48_2]], [[ADD48_3]] +; CHECK-NEXT: [[ADD103:%.*]] = add nsw i32 [[ADD94]], [[ADD78]] +; CHECK-NEXT: [[SUB104:%.*]] = sub nsw i32 [[ADD78]], 
[[ADD94]] +; CHECK-NEXT: [[ADD105:%.*]] = add nsw i32 [[SUB102]], [[SUB86]] +; CHECK-NEXT: [[SUB106:%.*]] = sub nsw i32 [[SUB86]], [[SUB102]] +; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[ADD103]], 15 +; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 +; CHECK-NEXT: [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535 +; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] +; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]] +; CHECK-NEXT: [[SHR_I184:%.*]] = lshr i32 [[ADD105]], 15 +; CHECK-NEXT: [[AND_I185:%.*]] = and i32 [[SHR_I184]], 65537 +; CHECK-NEXT: [[MUL_I186:%.*]] = mul nuw i32 [[AND_I185]], 65535 +; CHECK-NEXT: [[ADD_I187:%.*]] = add i32 [[MUL_I186]], [[ADD105]] +; CHECK-NEXT: [[XOR_I188:%.*]] = xor i32 [[ADD_I187]], [[MUL_I186]] +; CHECK-NEXT: [[SHR_I189:%.*]] = lshr i32 [[SUB104]], 15 +; CHECK-NEXT: [[AND_I190:%.*]] = and i32 [[SHR_I189]], 65537 +; CHECK-NEXT: [[MUL_I191:%.*]] = mul nuw i32 [[AND_I190]], 65535 +; CHECK-NEXT: [[ADD_I192:%.*]] = add i32 [[MUL_I191]], [[SUB104]] +; CHECK-NEXT: [[XOR_I193:%.*]] = xor i32 [[ADD_I192]], [[MUL_I191]] +; CHECK-NEXT: [[SHR_I194:%.*]] = lshr i32 [[SUB106]], 15 +; CHECK-NEXT: [[AND_I195:%.*]] = and i32 [[SHR_I194]], 65537 +; CHECK-NEXT: [[MUL_I196:%.*]] = mul nuw i32 [[AND_I195]], 65535 +; CHECK-NEXT: [[ADD_I197:%.*]] = add i32 [[MUL_I196]], [[SUB106]] +; CHECK-NEXT: [[XOR_I198:%.*]] = xor i32 [[ADD_I197]], [[MUL_I196]] +; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I188]], [[XOR_I]] +; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I193]] +; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I198]] +; CHECK-NEXT: [[ADD78_1:%.*]] = add nsw i32 [[ADD55_1]], [[ADD55]] +; CHECK-NEXT: [[SUB86_1:%.*]] = sub nsw i32 [[ADD55]], [[ADD55_1]] +; CHECK-NEXT: [[ADD94_1:%.*]] = add nsw i32 [[ADD55_3]], [[ADD55_2]] +; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 [[ADD55_2]], [[ADD55_3]] +; CHECK-NEXT: [[ADD103_1:%.*]] = add nsw i32 [[ADD94_1]], [[ADD78_1]] +; CHECK-NEXT: [[SUB104_1:%.*]] = sub nsw i32 [[ADD78_1]], [[ADD94_1]] +; CHECK-NEXT: [[ADD105_1:%.*]] = add nsw i32 [[SUB102_1]], [[SUB86_1]] +; CHECK-NEXT: [[SUB106_1:%.*]] = sub nsw i32 [[SUB86_1]], [[SUB102_1]] +; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[ADD103_1]], 15 +; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 +; CHECK-NEXT: [[MUL_I_1:%.*]] = mul nuw i32 [[AND_I_1]], 65535 +; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] +; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[MUL_I_1]] +; CHECK-NEXT: [[SHR_I184_1:%.*]] = lshr i32 [[ADD105_1]], 15 +; CHECK-NEXT: [[AND_I185_1:%.*]] = and i32 [[SHR_I184_1]], 65537 +; CHECK-NEXT: [[MUL_I186_1:%.*]] = mul nuw i32 [[AND_I185_1]], 65535 +; CHECK-NEXT: [[ADD_I187_1:%.*]] = add i32 [[MUL_I186_1]], [[ADD105_1]] +; CHECK-NEXT: [[XOR_I188_1:%.*]] = xor i32 [[ADD_I187_1]], [[MUL_I186_1]] +; CHECK-NEXT: [[SHR_I189_1:%.*]] = lshr i32 [[SUB104_1]], 15 +; CHECK-NEXT: [[AND_I190_1:%.*]] = and i32 [[SHR_I189_1]], 65537 +; CHECK-NEXT: [[MUL_I191_1:%.*]] = mul nuw i32 [[AND_I190_1]], 65535 +; CHECK-NEXT: [[ADD_I192_1:%.*]] = add i32 [[MUL_I191_1]], [[SUB104_1]] +; CHECK-NEXT: [[XOR_I193_1:%.*]] = xor i32 [[ADD_I192_1]], [[MUL_I191_1]] +; CHECK-NEXT: [[SHR_I194_1:%.*]] = lshr i32 [[SUB106_1]], 15 +; CHECK-NEXT: [[AND_I195_1:%.*]] = and i32 [[SHR_I194_1]], 65537 +; CHECK-NEXT: [[MUL_I196_1:%.*]] = mul nuw i32 [[AND_I195_1]], 65535 +; CHECK-NEXT: [[ADD_I197_1:%.*]] = add i32 [[MUL_I196_1]], [[SUB106_1]] +; CHECK-NEXT: [[XOR_I198_1:%.*]] = xor i32 [[ADD_I197_1]], [[MUL_I196_1]] +; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 
[[XOR_I188_1]], [[ADD113]] +; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] +; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I193_1]] +; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I198_1]] +; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 [[SUB51_1]], [[SUB51]] +; CHECK-NEXT: [[SUB86_2:%.*]] = sub nsw i32 [[SUB51]], [[SUB51_1]] +; CHECK-NEXT: [[ADD94_2:%.*]] = add nsw i32 [[SUB51_3]], [[SUB51_2]] +; CHECK-NEXT: [[SUB102_2:%.*]] = sub nsw i32 [[SUB51_2]], [[SUB51_3]] +; CHECK-NEXT: [[ADD103_2:%.*]] = add nsw i32 [[ADD94_2]], [[ADD78_2]] +; CHECK-NEXT: [[SUB104_2:%.*]] = sub nsw i32 [[ADD78_2]], [[ADD94_2]] +; CHECK-NEXT: [[ADD105_2:%.*]] = add nsw i32 [[SUB102_2]], [[SUB86_2]] +; CHECK-NEXT: [[SUB106_2:%.*]] = sub nsw i32 [[SUB86_2]], [[SUB102_2]] +; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[ADD103_2]], 15 +; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537 +; CHECK-NEXT: [[MUL_I_2:%.*]] = mul nuw i32 [[AND_I_2]], 65535 +; CHECK-NEXT: [[ADD_I_2:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]] +; CHECK-NEXT: [[XOR_I_2:%.*]] = xor i32 [[ADD_I_2]], [[MUL_I_2]] +; CHECK-NEXT: [[SHR_I184_2:%.*]] = lshr i32 [[ADD105_2]], 15 +; CHECK-NEXT: [[AND_I185_2:%.*]] = and i32 [[SHR_I184_2]], 65537 +; CHECK-NEXT: [[MUL_I186_2:%.*]] = mul nuw i32 [[AND_I185_2]], 65535 +; CHECK-NEXT: [[ADD_I187_2:%.*]] = add i32 [[MUL_I186_2]], [[ADD105_2]] +; CHECK-NEXT: [[XOR_I188_2:%.*]] = xor i32 [[ADD_I187_2]], [[MUL_I186_2]] +; CHECK-NEXT: [[SHR_I189_2:%.*]] = lshr i32 [[SUB104_2]], 15 +; CHECK-NEXT: [[AND_I190_2:%.*]] = and i32 [[SHR_I189_2]], 65537 +; CHECK-NEXT: [[MUL_I191_2:%.*]] = mul nuw i32 [[AND_I190_2]], 65535 +; CHECK-NEXT: [[ADD_I192_2:%.*]] = add i32 [[MUL_I191_2]], [[SUB104_2]] +; CHECK-NEXT: [[XOR_I193_2:%.*]] = xor i32 [[ADD_I192_2]], [[MUL_I191_2]] +; CHECK-NEXT: [[SHR_I194_2:%.*]] = lshr i32 [[SUB106_2]], 15 +; CHECK-NEXT: [[AND_I195_2:%.*]] = and i32 [[SHR_I194_2]], 65537 +; CHECK-NEXT: [[MUL_I196_2:%.*]] = mul nuw i32 [[AND_I195_2]], 65535 +; CHECK-NEXT: [[ADD_I197_2:%.*]] = add i32 [[MUL_I196_2]], [[SUB106_2]] +; CHECK-NEXT: [[XOR_I198_2:%.*]] = xor i32 [[ADD_I197_2]], [[MUL_I196_2]] +; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I188_2]], [[ADD113_1]] +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[XOR_I_2]] +; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[XOR_I193_2]] +; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I198_2]] +; CHECK-NEXT: [[ADD78_3:%.*]] = add nsw i32 [[SUB59_1]], [[SUB59]] +; CHECK-NEXT: [[SUB86_3:%.*]] = sub nsw i32 [[SUB59]], [[SUB59_1]] +; CHECK-NEXT: [[ADD94_3:%.*]] = add nsw i32 [[SUB59_3]], [[SUB59_2]] +; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 [[SUB59_2]], [[SUB59_3]] +; CHECK-NEXT: [[ADD103_3:%.*]] = add nsw i32 [[ADD94_3]], [[ADD78_3]] +; CHECK-NEXT: [[SUB104_3:%.*]] = sub nsw i32 [[ADD78_3]], [[ADD94_3]] +; CHECK-NEXT: [[ADD105_3:%.*]] = add nsw i32 [[SUB102_3]], [[SUB86_3]] +; CHECK-NEXT: [[SUB106_3:%.*]] = sub nsw i32 [[SUB86_3]], [[SUB102_3]] +; CHECK-NEXT: [[SHR_I_3:%.*]] = lshr i32 [[ADD103_3]], 15 +; CHECK-NEXT: [[AND_I_3:%.*]] = and i32 [[SHR_I_3]], 65537 +; CHECK-NEXT: [[MUL_I_3:%.*]] = mul nuw i32 [[AND_I_3]], 65535 +; CHECK-NEXT: [[ADD_I_3:%.*]] = add i32 [[MUL_I_3]], [[ADD103_3]] +; CHECK-NEXT: [[XOR_I_3:%.*]] = xor i32 [[ADD_I_3]], [[MUL_I_3]] +; CHECK-NEXT: [[SHR_I184_3:%.*]] = lshr i32 [[ADD105_3]], 15 +; CHECK-NEXT: [[AND_I185_3:%.*]] = and i32 [[SHR_I184_3]], 65537 +; CHECK-NEXT: [[MUL_I186_3:%.*]] = mul nuw i32 [[AND_I185_3]], 65535 +; CHECK-NEXT: [[ADD_I187_3:%.*]] = add 
i32 [[MUL_I186_3]], [[ADD105_3]] +; CHECK-NEXT: [[XOR_I188_3:%.*]] = xor i32 [[ADD_I187_3]], [[MUL_I186_3]] +; CHECK-NEXT: [[SHR_I189_3:%.*]] = lshr i32 [[SUB104_3]], 15 +; CHECK-NEXT: [[AND_I190_3:%.*]] = and i32 [[SHR_I189_3]], 65537 +; CHECK-NEXT: [[MUL_I191_3:%.*]] = mul nuw i32 [[AND_I190_3]], 65535 +; CHECK-NEXT: [[ADD_I192_3:%.*]] = add i32 [[MUL_I191_3]], [[SUB104_3]] +; CHECK-NEXT: [[XOR_I193_3:%.*]] = xor i32 [[ADD_I192_3]], [[MUL_I191_3]] +; CHECK-NEXT: [[SHR_I194_3:%.*]] = lshr i32 [[SUB106_3]], 15 +; CHECK-NEXT: [[AND_I195_3:%.*]] = and i32 [[SHR_I194_3]], 65537 +; CHECK-NEXT: [[MUL_I196_3:%.*]] = mul nuw i32 [[AND_I195_3]], 65535 +; CHECK-NEXT: [[ADD_I197_3:%.*]] = add i32 [[MUL_I196_3]], [[SUB106_3]] +; CHECK-NEXT: [[XOR_I198_3:%.*]] = xor i32 [[ADD_I197_3]], [[MUL_I196_3]] +; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I188_3]], [[ADD113_2]] +; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[XOR_I_3]] +; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[XOR_I193_3]] +; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I198_3]] +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[ADD113_3]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD113_3]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -30,14 +30,14 @@ define void @store_chain_v2i64(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @store_chain_v2i64( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[C:%.*]], align 8 ; CHECK-NEXT: ret void ; %a.1 = getelementptr i64, ptr %a, i64 1 @@ -101,8 +101,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP6]] ; %v0.0 = extractelement <2 x 
i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 @@ -123,22 +123,22 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1 +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_3]], i64 3 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP11]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 @@ -167,10 +167,10 @@ ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]] -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> 
[[TMP9]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -30,14 +30,14 @@ define void @store_chain_v2i64(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @store_chain_v2i64( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[C:%.*]], align 8 ; CHECK-NEXT: ret void ; %a.1 = getelementptr i64, ptr %a, i64 1 @@ -101,8 +101,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP6]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 @@ -123,22 +123,22 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> 
[[TMP11]], <4 x i32> [[TMP12]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1 +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_3]], i64 3 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP11]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 @@ -167,10 +167,10 @@ ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]] -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -18,19 +18,20 @@ define void @s116_modified(ptr %a) { ; CHECK-LABEL: @s116_modified( ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = 
shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]] -; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[A]], align 4 +; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[LD2:%.*]] = load float, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[MUL0:%.*]] = fmul fast float [[LD0]], [[LD1]] +; CHECK-NEXT: [[MUL1:%.*]] = fmul fast float [[LD2]], [[LD1]] +; CHECK-NEXT: store float [[MUL0]], ptr [[A]], align 4 +; CHECK-NEXT: store float [[MUL1]], ptr [[GEP1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[LD2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[GEP2]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -232,23 +232,27 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6 +; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 4 +; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i64 5 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i64 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> +; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6 +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP6]], i32 
[[AB4]], i64 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i64 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6 ; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 ; SSE-NEXT: ret <8 x i32> [[R7]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -232,23 +232,27 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6 +; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 4 +; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i64 5 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i64 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> +; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6 +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[AB4]], i64 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i64 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6 ; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 ; SSE-NEXT: ret <8 x i32> [[R7]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll @@ -607,26 +607,32 @@ ; SSE-NEXT: ret <8 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_8f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A:%.*]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5 +; SLM-NEXT: 
[[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
+; SLM-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
+; SLM-NEXT: [[B4:%.*]] = extractelement <8 x double> [[B:%.*]], i32 4
+; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5
+; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6
+; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32>
 ; SLM-NEXT: [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]]
 ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32>
 ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32>
 ; SLM-NEXT: [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32>
-; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32>
-; SLM-NEXT: [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]]
-; SLM-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32>
-; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32>
-; SLM-NEXT: [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]]
-; SLM-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32>
-; SLM-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32>
-; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32>
-; SLM-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32>
-; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32>
-; SLM-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32>
-; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32>
-; SLM-NEXT: ret <8 x double> [[R73]]
+; SLM-NEXT: [[C4:%.*]] = fdiv double [[A4]], [[B4]]
+; SLM-NEXT: [[C5:%.*]] = fdiv double [[A5]], [[B5]]
+; SLM-NEXT: [[C6:%.*]] = fdiv double [[A6]], [[B6]]
+; SLM-NEXT: [[C7:%.*]] = fdiv double [[A7]], [[B7]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32>
+; SLM-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32>
+; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32>
+; SLM-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R31]], double [[C4]], i32 4
+; SLM-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
+; SLM-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
+; SLM-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
+; SLM-NEXT: ret <8 x double> [[R7]]
 ;
 ; AVX-LABEL: @buildvector_div_8f64(
 ; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
@@ -607,26 +607,32 @@
 ; SSE-NEXT: ret <8 x double> [[TMP1]]
 ;
 ; SLM-LABEL: @buildvector_div_8f64(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32>
+; SLM-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A:%.*]], i32 4
+; SLM-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
+; SLM-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
+; SLM-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
+; SLM-NEXT: [[B4:%.*]] = extractelement <8 x double> [[B:%.*]], i32 4
+; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5
+; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6
+; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32>
 ; SLM-NEXT: [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]]
 ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32>
 ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32>
 ; SLM-NEXT: [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32>
-; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32>
-; SLM-NEXT: [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]]
-; SLM-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32>
-; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32>
-; SLM-NEXT: [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]]
-; SLM-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32>
-; SLM-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32>
-; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32>
-; SLM-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32>
-; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32>
-; SLM-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32>
-; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32>
-; SLM-NEXT: ret <8 x double> [[R73]]
+; SLM-NEXT: [[C4:%.*]] = fdiv double [[A4]], [[B4]]
+; SLM-NEXT: [[C5:%.*]] = fdiv double [[A5]], [[B5]]
+; SLM-NEXT: [[C6:%.*]] = fdiv double [[A6]], [[B6]]
+; SLM-NEXT: [[C7:%.*]] = fdiv double [[A7]], [[B7]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32>
+; SLM-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32>
+; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32>
+; SLM-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R31]], double [[C4]], i32 4
+; SLM-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
+; SLM-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
+; SLM-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
+; SLM-NEXT: ret <8 x double> [[R7]]
 ;
 ; AVX-LABEL: @buildvector_div_8f64(
 ; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -16,13 +16,28 @@
 define void @splat(i8 %a, i8 %b, i8 %c) {
 ; SSE-LABEL: @splat(
-; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
-; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32>
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer
-; SSE-NEXT: [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]]
-; SSE-NEXT: store <16 x i8> [[TMP6]], ptr @cle, align 16
+; SSE-NEXT: [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]]
+; SSE-NEXT: store i8 [[TMP1]], ptr @cle, align 16
+; SSE-NEXT: [[TMP2:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT: store i8 [[TMP2]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 1), align 1
+; SSE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT: store i8 [[TMP3]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 2), align 1
+; SSE-NEXT: [[TMP4:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT: store i8 [[TMP4]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 3), align 1
+; SSE-NEXT: [[TMP5:%.*]] = xor i8 [[C]], [[A]]
+; SSE-NEXT: store i8 [[TMP5]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 4), align 1
+; SSE-NEXT: [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]]
+; SSE-NEXT: store i8 [[TMP6]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 5), align 1
+; SSE-NEXT: [[TMP7:%.*]] = xor i8 [[C]], [[A]]
+; SSE-NEXT: store i8 [[TMP7]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 6), align 1
+; SSE-NEXT: [[TMP8:%.*]] = xor i8 [[C]], [[B]]
+; SSE-NEXT: store i8 [[TMP8]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 7), align 1
+; SSE-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
+; SSE-NEXT: [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> poison, <8 x i32> zeroinitializer
+; SSE-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> poison, i8 [[C]], i32 0
+; SSE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> poison, <8 x i32> zeroinitializer
+; SSE-NEXT: [[TMP13:%.*]] = xor <8 x i8> [[TMP10]], [[TMP12]]
+; SSE-NEXT: store <8 x i8> [[TMP13]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 8), align 1
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @splat(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
@@ -8,29 +8,30 @@
 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0
 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> , <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP7]], i32 3
-; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP15]], <8 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <8 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP17]], <8 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32>
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = fsub <8 x double> [[TMP16]], [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = fmul <8 x double> [[TMP16]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> [[TMP22]], <8 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = fptrunc <8 x double> [[TMP23]] to <8 x float>
-; CHECK-NEXT: [[TMP25:%.*]] = fmul <8 x float> [[TMP24]], zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = fcmp oeq <8 x float> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = freeze <8 x i1> [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP27]])
-; CHECK-NEXT: ret i1 [[TMP28]]
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <2 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <2 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> , <2 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <8 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP14]], <8 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP18]], <8 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32>
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x i32>
+; CHECK-NEXT: [[TMP22:%.*]] = fsub <8 x double> [[TMP17]], [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = fmul <8 x double> [[TMP17]], [[TMP21]]
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x double> [[TMP22]], <8 x double> [[TMP23]], <8 x i32>
+; CHECK-NEXT: [[TMP25:%.*]] = fptrunc <8 x double> [[TMP24]] to <8 x float>
+; CHECK-NEXT: [[TMP26:%.*]] = fmul <8 x float> [[TMP25]], zeroinitializer
+; CHECK-NEXT: [[TMP27:%.*]] = fcmp oeq <8 x float> [[TMP26]], zeroinitializer
+; CHECK-NEXT: [[TMP28:%.*]] = freeze <8 x i1> [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP28]])
+; CHECK-NEXT: ret i1 [[TMP29]]
 ;
 %3 = fpext float %0 to double
 %4 = fpext float 0.000000e+00 to double
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -524,36 +524,22 @@
 define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
-; SSE-LABEL: @foo(
-; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
-; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32>
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
-; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0
-; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
-; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]],
-; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
-; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; SSE-NEXT: ret i1 [[CMP_I185]]
-;
-; AVX-LABEL: @foo(
-; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
-; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
-; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1
-; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
-; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
-; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
-; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
-; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
-; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; AVX-NEXT: ret i1 [[CMP_I185]]
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
+; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]],
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
+; CHECK-NEXT: ret i1 [[CMP_I185]]
 ;
 %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
 %sub14.i167 = fsub float undef, %vecext.i291.i166
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
 ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
 ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
 ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
@@ -25,20 +25,33 @@
 ;
 ; AVX-LABEL: @foo(
 ; AVX-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16
+; AVX-NEXT: store i32 [[TMP1]], ptr @a, align 16
 ; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
-; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
-; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1
-; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32>
-; AVX-NEXT: store <8 x i32> [[SHUFFLE]], ptr @a, align 16
+; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 1), align 4
+; AVX-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 2), align 8
+; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 3), align 4
+; AVX-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 4), align 16
+; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 5), align 4
+; AVX-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 6), align 8
+; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 7), align 4
 ; AVX-NEXT: ret void
 ;
+; AVX2-LABEL: @foo(
+; AVX2-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16
+; AVX2-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
+; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32>
+; AVX2-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16
+; AVX2-NEXT: ret void
+;
 ; AVX512-LABEL: @foo(
 ; AVX512-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16
 ; AVX512-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
 ; AVX512-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
 ; AVX512-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1
-; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32>
-; AVX512-NEXT: store <8 x i32> [[SHUFFLE]], ptr @a, align 16
+; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32>
+; AVX512-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16
 ; AVX512-NEXT: ret void
 ;
 %1 = load i32, ptr @b, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+; FIXME: fix the cost of the xor reduction ops
+
 define i16 @test() {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
@@ -8,19 +10,17 @@
 ; CHECK-NEXT: [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
 ; CHECK-NEXT: br label [[WHILE:%.*]]
 ; CHECK: while:
-; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX25:%.*]], [[WHILE]] ]
+; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX27:%.*]], [[WHILE]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A1]], align 16
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> zeroinitializer, <4 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP7]])
-; CHECK-NEXT: [[OP_RDX23:%.*]] = xor i64 0, [[TMP1]]
-; CHECK-NEXT: [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP8]]
-; CHECK-NEXT: [[OP_RDX25]] = xor i64 [[OP_RDX23]], [[OP_RDX24]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A1]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> zeroinitializer, <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT: [[OP_RDX24:%.*]] = xor i64 0, [[TMP2]]
+; CHECK-NEXT: [[OP_RDX25:%.*]] = xor i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[OP_RDX26:%.*]] = xor i64 [[OP_RDX24]], [[OP_RDX25]]
+; CHECK-NEXT: [[OP_RDX27]] = xor i64 [[OP_RDX26]], [[TMP5]]
 ; CHECK-NEXT: br label [[WHILE]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
@@ -11,18 +11,18 @@
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, <8 x ptr> [[SHUFFLE]], <8 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64>
 ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> , <8 x double> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> , <8 x double> poison)
 ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <8 x double> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP8]])
-; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]])
-; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i64 0
-; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP10]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]])
+; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64>
 ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> )
 ; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
@@ -7,13 +7,12 @@
 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i32 3
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]],
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP1]], undef
-; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> undef, <4 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP6]]
-; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP27]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP0]],
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[TMP1]], undef
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> undef, <4 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
+; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP27]], align 8
 ; CHECK-NEXT: ret void
 ;
 bb: