diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6286,6 +6286,262 @@
   return {VK, VP};
 }
 
+namespace {
+/// The base class for shuffle instruction emission and shuffle cost
+/// estimation.
+class BaseShuffleAnalysis {
+protected:
+  /// Checks if the mask is an identity mask.
+  /// \param IsStrict if true, the function returns false if the mask size
+  /// does not match the vector size.
+  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
+                             bool IsStrict) {
+    int Limit = Mask.size();
+    int VF = VecTy->getNumElements();
+    return (VF == Limit || !IsStrict) &&
+           all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
+           ShuffleVectorInst::isIdentityMask(Mask);
+  }
+
+  /// Tries to combine 2 different masks into a single one.
+  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
+                           ArrayRef<int> ExtMask) {
+    unsigned VF = Mask.size();
+    SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
+      if (ExtMask[I] == UndefMaskElem)
+        continue;
+      int MaskedIdx = Mask[ExtMask[I] % VF];
+      NewMask[I] =
+          MaskedIdx == UndefMaskElem ? UndefMaskElem : MaskedIdx % LocalVF;
+    }
+    Mask.swap(NewMask);
+  }
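
Editorial aside: combineMasks composes an outer mask on top of an inner one, renormalizing the selected indices by the local vector factor. A minimal standalone sketch of the composition step, in plain C++ with -1 standing in for UndefMaskElem; the % LocalVF renormalization above is left out for clarity, and all names here are illustrative, not part of the patch:

#include <cassert>
#include <cstddef>
#include <vector>

constexpr int Undef = -1; // stands in for UndefMaskElem

// Compose OuterMask over InnerMask: element I of the result selects the
// element that InnerMask selected at position OuterMask[I].
std::vector<int> composeMasks(const std::vector<int> &InnerMask,
                              const std::vector<int> &OuterMask) {
  std::vector<int> Result(OuterMask.size(), Undef);
  for (std::size_t I = 0; I < OuterMask.size(); ++I)
    if (OuterMask[I] != Undef)
      Result[I] = InnerMask[OuterMask[I] % InnerMask.size()];
  return Result;
}

int main() {
  // %s = shufflevector %v, poison, <1, 0>; shuffling %s again with <1, 0>
  // composes to the identity over %v.
  assert(composeMasks({1, 0}, {1, 0}) == (std::vector<int>{0, 1}));
  return 0;
}
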
+
+  /// Looks through shuffles trying to reduce the final number of shuffles in
+  /// the code. The function looks through the previously emitted shuffle
+  /// instructions and properly marks indices in the mask as undef.
+  /// For example, given the code
+  /// \code
+  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
+  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
+  /// \endcode
+  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>,
+  /// it will look through %s1 and %s2 and emit
+  /// \code
+  /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
+  /// \endcode
+  /// instead.
+  /// If 2 operands are of different size, the smaller one will be resized and
+  /// the mask recalculated properly.
+  /// For example, given the code
+  /// \code
+  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
+  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
+  /// \endcode
+  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
+  /// it will look through %s1 and %s2 and emit
+  /// \code
+  /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
+  /// \endcode
+  /// instead.
+  /// So, it tries to transform permutations to simple vector merges, if
+  /// possible.
+  /// \param V The input vector which must be shuffled using the given \p
+  /// Mask. If a better candidate is found, \p V is set to this best candidate
+  /// vector.
+  /// \param Mask The input mask for the shuffle. If a better candidate is
+  /// found during the look-through-shuffles attempt, it is updated
+  /// accordingly.
+  /// \param SinglePermute true if the shuffle operation is originally a
+  /// single-value-permutation. In this case the look-through-shuffles
+  /// procedure may look for resizing shuffles as the best candidates.
+  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
+                                  bool SinglePermute) {
+    Value *Op = V;
+    ShuffleVectorInst *IdentityOp = nullptr;
+    SmallVector<int> IdentityMask;
+    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
+      // Exit if not a fixed vector type or changing size shuffle.
+      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
+      if (!SVTy)
+        break;
+      // Remember the identity or broadcast mask, if it is not a resizing
+      // shuffle. If no better candidates are found, this Op and Mask will be
+      // used in the final shuffle.
+      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
+        if (!IdentityOp || !SinglePermute ||
+            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
+             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) {
+          IdentityOp = SV;
+          // Store the current mask in IdentityMask so that we do not lose
+          // this info if IdentityOp is selected as the best candidate for
+          // the permutation.
+          IdentityMask.assign(Mask);
+        }
+      }
+      // Remember the broadcast mask. If no better candidates are found, this
+      // Op and Mask will be used in the final shuffle.
+      if (SV->isZeroEltSplat()) {
+        IdentityOp = SV;
+        IdentityMask.assign(Mask);
+      }
+      int LocalVF = Mask.size();
+      if (auto *SVOpTy =
+              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
+        LocalVF = SVOpTy->getNumElements();
+      bool IsOp1Undef =
+          isUndefVector(SV->getOperand(0),
+                        buildUseMask(LocalVF, Mask, UseMask::FirstArg))
+              .all();
+      bool IsOp2Undef =
+          isUndefVector(SV->getOperand(1),
+                        buildUseMask(LocalVF, Mask, UseMask::SecondArg))
+              .all();
+      if (!IsOp1Undef && !IsOp2Undef) {
+        // Update mask and mark undef elems.
+        for (auto [Idx, I] : enumerate(Mask)) {
+          if (I == UndefMaskElem)
+            continue;
+          if (SV->getShuffleMask()[I % SV->getShuffleMask().size()] ==
+              UndefMaskElem)
+            I = UndefMaskElem;
+        }
+        break;
+      }
+      SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
+                                   SV->getShuffleMask().end());
+      combineMasks(LocalVF, ShuffleMask, Mask);
+      Mask.swap(ShuffleMask);
+      if (IsOp2Undef)
+        Op = SV->getOperand(0);
+      else
+        Op = SV->getOperand(1);
+    }
+    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
+        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute)) {
+      if (IdentityOp) {
+        V = IdentityOp;
+        assert(Mask.size() == IdentityMask.size() &&
+               "Expected masks of same sizes.");
+        // Clear known poison elements.
+        for (auto [I, Idx] : enumerate(Mask))
+          if (Idx == UndefMaskElem)
+            IdentityMask[I] = UndefMaskElem;
+        Mask.swap(IdentityMask);
+        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
+        return SinglePermute &&
+               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
+                               /*IsStrict=*/true) ||
+                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
+                 Shuffle->isZeroEltSplat() &&
+                 ShuffleVectorInst::isZeroEltSplatMask(Mask)));
+      }
+      V = Op;
+      return false;
+    }
+    V = Op;
+    return true;
+  }
+
+  /// Smart shuffle instruction emission, walks through shuffle trees and
+  /// tries to find the best matching vector for the actual shuffle
+  /// instruction.
+  template <typename ShuffleBuilderTy>
+  static Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
+                              ShuffleBuilderTy &Builder) {
+    assert(V1 && "Expected at least one vector value.");
+    int VF = Mask.size();
+    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
+      VF = FTy->getNumElements();
+    if (V2 &&
+        !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
+      // Peek through shuffles.
+      Value *Op1 = V1;
+      Value *Op2 = V2;
+      int VF = cast<VectorType>(V1->getType())
+                   ->getElementCount()
+                   .getKnownMinValue();
+      SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
+      SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+      for (int I = 0, E = Mask.size(); I < E; ++I) {
+        if (Mask[I] < VF)
+          CombinedMask1[I] = Mask[I];
+        else
+          CombinedMask2[I] = Mask[I] - VF;
+      }
+      Value *PrevOp1;
+      Value *PrevOp2;
+      do {
+        PrevOp1 = Op1;
+        PrevOp2 = Op2;
+        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
+        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
+        // Check if we have 2 resizing shuffles - need to peek through operands
+        // again.
+        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
+          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
+            SmallBitVector UseMask1 = buildUseMask(
+                cast<FixedVectorType>(SV1->getOperand(1)->getType())
+                    ->getNumElements(),
+                CombinedMask1, UseMask::FirstArg);
+            SmallBitVector UseMask2 = buildUseMask(
+                cast<FixedVectorType>(SV2->getOperand(1)->getType())
+                    ->getNumElements(),
+                CombinedMask2, UseMask::FirstArg);
+            if (SV1->getOperand(0)->getType() ==
+                    SV2->getOperand(0)->getType() &&
+                SV1->getOperand(0)->getType() != SV1->getType() &&
+                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
+                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
+              Op1 = SV1->getOperand(0);
+              Op2 = SV2->getOperand(0);
+              SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
+                                            SV1->getShuffleMask().end());
+              int LocalVF = ShuffleMask1.size();
+              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
+                LocalVF = FTy->getNumElements();
+              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
+              CombinedMask1.swap(ShuffleMask1);
+              SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
+                                            SV2->getShuffleMask().end());
+              LocalVF = ShuffleMask2.size();
+              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
+                LocalVF = FTy->getNumElements();
+              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
+              CombinedMask2.swap(ShuffleMask2);
+            }
+          }
+      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
+      Builder.resizeToMatch(Op1, Op2);
+      VF = std::max(cast<VectorType>(Op1->getType())
+                        ->getElementCount()
+                        .getKnownMinValue(),
+                    cast<VectorType>(Op2->getType())
+                        ->getElementCount()
+                        .getKnownMinValue());
+      for (int I = 0, E = Mask.size(); I < E; ++I) {
+        if (CombinedMask2[I] != UndefMaskElem) {
+          assert(CombinedMask1[I] == UndefMaskElem &&
+                 "Expected undefined mask element");
+          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
+        }
+      }
+      return Builder.createShuffleVector(
+          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
+          CombinedMask1);
+    }
+    if (isa<PoisonValue>(V1))
+      return PoisonValue::get(FixedVectorType::get(
+          cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
+    SmallVector<int> NewMask(Mask.begin(), Mask.end());
+    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
+    assert(V1 && "Expected non-null value after looking through shuffles.");
+
+    if (!IsIdentity)
+      return Builder.createShuffleVector(V1, NewMask);
+    return V1;
+  }
+};
+} // namespace
+
 InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                       ArrayRef<Value *> VectorizedVals) {
   ArrayRef<Value *> VL = E->Scalars;
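
Editorial aside: createShuffle is templated over the builder so the same peek-through walk can drive both IR emission and, eventually, cost estimation. Inferred from the calls above, the interface the template parameter must provide looks roughly like this; a declaration-only sketch, with ShuffleBuilderConcept as an illustrative name (the concrete implementation in this patch is the ShuffleIRBuilder used by BoUpSLP::ShuffleInstructionBuilder below):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Value.h"
using namespace llvm;

struct ShuffleBuilderConcept {
  // Widen the narrower of V1/V2 so both operands have the same vector type
  // (called right before the final two-operand shuffle is emitted).
  void resizeToMatch(Value *&V1, Value *&V2);
  // Emit (or cost-model) shufflevector V1, V2, Mask.
  Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask);
  // Emit (or cost-model) a single-operand permutation of V1.
  Value *createShuffleVector(Value *V1, ArrayRef<int> Mask);
};
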
@@ -8146,7 +8402,7 @@
 /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
 /// \endcode
 /// instead.
-class BoUpSLP::ShuffleInstructionBuilder {
+class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   bool IsFinalized = false;
   /// Combined mask for all applied operands and masks. It is built during
   /// analysis and actual emission of shuffle vector instructions.
@@ -8155,7 +8411,7 @@
   /// operands, if the 3rd is going to be added, the first 2 are combined into
   /// shuffle with \p CommonMask mask, the first operand sets to be the
   /// resulting shuffle and the second operand sets to be the newly added
-  /// operand. The \p CombinedMask is transformed in the proper way after that.
+  /// operand. The \p CommonMask is transformed in the proper way after that.
   SmallVector<Value *, 2> InVectors;
   IRBuilderBase &Builder;
   BoUpSLP &R;
@@ -8199,6 +8455,29 @@
       }
       return Vec;
     }
+    /// Resizes 2 input vectors to match their sizes, if they are not equal
+    /// yet. The smaller vector is resized to the size of the larger vector.
+    void resizeToMatch(Value *&V1, Value *&V2) {
+      if (V1->getType() == V2->getType())
+        return;
+      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
+      int VF = std::max(V1VF, V2VF);
+      int MinVF = std::min(V1VF, V2VF);
+      SmallVector<int> IdentityMask(VF, UndefMaskElem);
+      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
+                0);
+      Value *&Op = MinVF == V1VF ? V1 : V2;
+      Op = Builder.CreateShuffleVector(Op, IdentityMask);
+      if (auto *I = dyn_cast<Instruction>(Op)) {
+        GatherShuffleExtractSeq.insert(I);
+        CSEBlocks.insert(I->getParent());
+      }
+      if (MinVF == V1VF)
+        V1 = Op;
+      else
+        V2 = Op;
+    }
   };
 
   /// Smart shuffle instruction emission, walks through shuffles trees and
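
Editorial aside: resizeToMatch widens the narrower operand with an identity shuffle padded by undefs. A standalone sketch of the mask it builds, in plain C++ with -1 for undef and illustrative names only:

#include <numeric>
#include <vector>

// Build the mask resizeToMatch uses to widen a MinVF-wide vector to VF
// lanes: <0, 1, ..., MinVF-1, undef, ..., undef>. -1 stands for undef.
std::vector<int> makeResizeMask(int MinVF, int VF) {
  std::vector<int> Mask(VF, -1);
  std::iota(Mask.begin(), std::next(Mask.begin(), MinVF), 0);
  return Mask;
}

int main() {
  // Widening a <2 x float> to 4 lanes yields mask <0, 1, -1, -1>, i.e.
  // shufflevector %v, poison, <i32 0, i32 1, i32 undef, i32 undef>.
  std::vector<int> Mask = makeResizeMask(2, 4);
  return Mask == std::vector<int>{0, 1, -1, -1} ? 0 : 1;
}
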
@@ -8208,9 +8487,7 @@
     assert(V1 && "Expected at least one vector value.");
     ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                     R.CSEBlocks);
-    if (V2)
-      return ShuffleBuilder.createShuffleVector(V1, V2, Mask);
-    return ShuffleBuilder.createShuffleVector(V1, Mask);
+    return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder);
   }
 
   /// Transforms mask \p CommonMask per given \p Mask to make proper set after
@@ -9433,157 +9710,21 @@
     LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
   }
 
-  // Checks if the mask is an identity mask.
-  auto &&IsIdentityMask = [](ArrayRef<int> Mask, FixedVectorType *VecTy) {
-    int Limit = Mask.size();
-    return VecTy->getNumElements() == Mask.size() &&
-           all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
-           ShuffleVectorInst::isIdentityMask(Mask);
-  };
-  // Tries to combine 2 different masks into single one.
-  auto &&CombineMasks = [](SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) {
-    SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
-    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
-      if (ExtMask[I] == UndefMaskElem)
-        continue;
-      NewMask[I] = Mask[ExtMask[I]];
-    }
-    Mask.swap(NewMask);
-  };
-  // Peek through shuffles, trying to simplify the final shuffle code.
-  auto &&PeekThroughShuffles =
-      [&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl<int> &Mask,
-                                       bool CheckForLengthChange = false) {
-        while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
-          // Exit if not a fixed vector type or changing size shuffle.
-          if (!isa<FixedVectorType>(SV->getType()) ||
-              (CheckForLengthChange && SV->changesLength()))
-            break;
-          // Exit if the identity or broadcast mask is found.
-          if (IsIdentityMask(Mask, cast<FixedVectorType>(SV->getType())) ||
-              SV->isZeroEltSplat())
-            break;
-          int LocalVF = Mask.size();
-          if (auto *SVOpTy =
-                  dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
-            LocalVF = SVOpTy->getNumElements();
-          bool IsOp1Undef =
-              isUndefVector(SV->getOperand(0),
-                            buildUseMask(LocalVF, Mask, UseMask::FirstArg))
-                  .all();
-          bool IsOp2Undef =
-              isUndefVector(SV->getOperand(1),
-                            buildUseMask(LocalVF, Mask, UseMask::SecondArg))
-                  .all();
-          if (!IsOp1Undef && !IsOp2Undef)
-            break;
-          SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
-                                       SV->getShuffleMask().end());
-          CombineMasks(ShuffleMask, Mask);
-          Mask.swap(ShuffleMask);
-          if (IsOp2Undef)
-            V = SV->getOperand(0);
-          else
-            V = SV->getOperand(1);
-        }
-      };
-  // Smart shuffle instruction emission, walks through shuffles trees and
-  // tries to find the best matching vector for the actual shuffle
-  // instruction.
-  auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles,
-                          &CombineMasks](Value *V1, Value *V2,
-                                         ArrayRef<int> Mask) -> Value * {
-    assert(V1 && "Expected at least one vector value.");
-    int VF = Mask.size();
-    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
-      VF = FTy->getNumElements();
-    if (V2 &&
-        !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
-      // Peek through shuffles.
-      Value *Op1 = V1;
-      Value *Op2 = V2;
-      int VF = cast<VectorType>(V1->getType())
-                   ->getElementCount()
-                   .getKnownMinValue();
-      SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
-      SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
-      for (int I = 0, E = Mask.size(); I < E; ++I) {
-        if (Mask[I] < VF)
-          CombinedMask1[I] = Mask[I];
-        else
-          CombinedMask2[I] = Mask[I] - VF;
-      }
-      Value *PrevOp1;
-      Value *PrevOp2;
-      do {
-        PrevOp1 = Op1;
-        PrevOp2 = Op2;
-        PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true);
-        PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true);
-        // Check if we have 2 resizing shuffles - need to peek through operands
-        // again.
-        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
-          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
-            SmallBitVector UseMask1 = buildUseMask(
-                cast<FixedVectorType>(SV1->getOperand(1)->getType())
-                    ->getNumElements(),
-                CombinedMask1, UseMask::FirstArg);
-            SmallBitVector UseMask2 = buildUseMask(
-                cast<FixedVectorType>(SV2->getOperand(1)->getType())
-                    ->getNumElements(),
-                CombinedMask2, UseMask::FirstArg);
-            if (SV1->getOperand(0)->getType() ==
-                    SV2->getOperand(0)->getType() &&
-                SV1->getOperand(0)->getType() != SV1->getType() &&
-                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
-                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
-              Op1 = SV1->getOperand(0);
-              Op2 = SV2->getOperand(0);
-              SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
-                                            SV1->getShuffleMask().end());
-              CombineMasks(ShuffleMask1, CombinedMask1);
-              CombinedMask1.swap(ShuffleMask1);
-              SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
-                                            SV2->getShuffleMask().end());
-              CombineMasks(ShuffleMask2, CombinedMask2);
-              CombinedMask2.swap(ShuffleMask2);
-            }
-          }
-      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
-      VF = cast<VectorType>(Op1->getType())
-               ->getElementCount()
-               .getKnownMinValue();
-      for (int I = 0, E = Mask.size(); I < E; ++I) {
-        if (CombinedMask2[I] != UndefMaskElem) {
-          assert(CombinedMask1[I] == UndefMaskElem &&
-                 "Expected undefined mask element");
-          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
-        }
-      }
-      Value *Vec = Builder.CreateShuffleVector(
-          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
-          CombinedMask1);
-      if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherShuffleExtractSeq.insert(I);
-        CSEBlocks.insert(I->getParent());
-      }
-      return Vec;
-    }
-    if (isa<PoisonValue>(V1))
-      return PoisonValue::get(FixedVectorType::get(
-          cast<FixedVectorType>(V1->getType())->getElementType(),
-          Mask.size()));
-    Value *Op = V1;
-    SmallVector<int> CombinedMask(Mask);
-    PeekThroughShuffles(Op, CombinedMask);
-    if (!isa<FixedVectorType>(Op->getType()) ||
-        !IsIdentityMask(CombinedMask, cast<FixedVectorType>(Op->getType()))) {
-      Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask);
-      if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherShuffleExtractSeq.insert(I);
-        CSEBlocks.insert(I->getParent());
-      }
-      return Vec;
-    }
-    return Op;
-  };
+  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
+    SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
+    SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+    for (int I = 0, E = Mask.size(); I < E; ++I) {
+      if (Mask[I] < VF)
+        CombinedMask1[I] = Mask[I];
+      else
+        CombinedMask2[I] = Mask[I] - VF;
+    }
+    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+    ShuffleBuilder.add(V1, CombinedMask1);
+    if (V2)
+      ShuffleBuilder.add(V2, CombinedMask2);
+    return ShuffleBuilder.finalize(std::nullopt);
+  };
 
   auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
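
Editorial aside: the new CreateShuffle wrapper only splits the combined two-operand mask into per-operand masks and delegates all the peek-through logic to ShuffleInstructionBuilder. The split itself, sketched standalone in plain C++ with -1 for undef and hypothetical names:

#include <cassert>
#include <cstddef>
#include <vector>

constexpr int Undef = -1;

// Split a two-operand shuffle mask into per-operand masks, as the new
// CreateShuffle lambda does: lanes < VF select from the first operand,
// lanes >= VF select (rebased) from the second.
void splitMask(const std::vector<int> &Mask, int VF,
               std::vector<int> &Mask1, std::vector<int> &Mask2) {
  Mask1.assign(Mask.size(), Undef);
  Mask2.assign(Mask.size(), Undef);
  for (std::size_t I = 0; I < Mask.size(); ++I) {
    if (Mask[I] < VF)
      Mask1[I] = Mask[I];      // lane comes from the first operand
    else
      Mask2[I] = Mask[I] - VF; // lane comes from the second operand
  }
}

int main() {
  std::vector<int> M1, M2;
  splitMask({1, 0, 5, 4}, /*VF=*/4, M1, M2);
  assert(M1 == (std::vector<int>{1, 0, Undef, Undef}));
  assert(M2 == (std::vector<int>{Undef, Undef, 1, 0}));
  return 0;
}
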
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -30,11 +30,11 @@
 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32>
 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP4]], <4 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x float> [[TMP9]], [[SHUFFLE]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP12]], <4 x float>* [[TMP13]], align 4
 ; CHECK-NEXT: ret void
 ;
 %gep0 = getelementptr inbounds float, float* %a, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
@@ -43,10 +43,9 @@
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
 ; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt float [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32>
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr null, align 4
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP2]], <2 x i32>
 ; CHECK-NEXT: store <2 x float> [[TMP8]], ptr null, align 4
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP2]], <2 x i32>
-; CHECK-NEXT: store <2 x float> [[TMP9]], ptr null, align 4
 ; CHECK-NEXT: ret void
 ;
 %1 = getelementptr inbounds float, ptr undef, i32 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
@@ -26,26 +26,27 @@
 ; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float>
 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP7]],
 ; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x float> , [[TMP8]]
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1
-; CHECK-NEXT: store float [[TMP10]], float* @g, align 4
-; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[SHUFFLE]],
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
-; CHECK-NEXT: store float [[TMP12]], float* @c, align 4
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
-; CHECK-NEXT: store float [[TMP13]], float* @d, align 4
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
-; CHECK-NEXT: store float [[TMP14]], float* @e, align 4
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
-; CHECK-NEXT: store float [[TMP15]], float* @f, align 4
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> [[SHUFFLE]], <4 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = fsub <4 x float> [[TMP11]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP11]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x float> [[TMP18]], <4 x float> [[TMP19]], <4 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = fptosi <4 x float> [[TMP20]] to <4 x i32>
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* [[TMP22]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 1
+; CHECK-NEXT: store float [[TMP11]], float* @g, align 4
+; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x float> [[TMP10]],
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP12]], i32 2
+; CHECK-NEXT: store float [[TMP13]], float* @c, align 4
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP12]], i32 0
+; CHECK-NEXT: store float [[TMP14]], float* @d, align 4
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP12]], i32 3
+; CHECK-NEXT: store float [[TMP15]], float* @e, align 4
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP12]], i32 1
+; CHECK-NEXT: store float [[TMP16]], float* @f, align 4
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = fsub <4 x float> [[TMP12]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = fadd <4 x float> [[TMP12]], [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP21]], <4 x i32>
+; CHECK-NEXT: [[TMP23:%.*]] = fptosi <4 x float> [[TMP22]] to <4 x i32>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP23]], <4 x i32>* [[TMP24]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -13,34 +13,35 @@
 ; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX12:%.*]], [[WHILE]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A2]], align 8
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A]], align 8
-; CHECK-NEXT: [[SHUFFLE13:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> poison, i64 [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP4]], <16 x i64> [[TMP5]], <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[SHUFFLE]], <4 x i64> poison, <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> [[TMP7]], <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i64> [[TMP8]], <16 x i64> [[TMP9]], <16 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i64> [[TMP10]], i64 [[TMP0]], i32 9
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i64> [[TMP11]], i64 [[TMP0]], i32 10
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i64> [[TMP12]], i64 [[TMP0]], i32 11
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[SHUFFLE13]], <4 x i64> poison, <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i64> [[TMP13]], <16 x i64> [[TMP14]], <16 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[A2]], align 8
-; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[A3]], align 16
-; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[TMP15]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = xor i64 [[TMP19]], [[TMP2]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i64 [[TMP2]], [[TMP2]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[SHUFFLE13]], i32 3
-; CHECK-NEXT: [[OP_RDX2:%.*]] = xor i64 [[TMP2]], [[TMP20]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = xor i64 [[TMP20]], [[TMP16]]
-; CHECK-NEXT: [[OP_RDX4:%.*]] = xor i64 [[TMP16]], [[TMP16]]
-; CHECK-NEXT: [[OP_RDX5:%.*]] = xor i64 [[TMP17]], [[TMP17]]
-; CHECK-NEXT: [[OP_RDX6:%.*]] = xor i64 [[TMP18]], [[TMP18]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i64> poison, i64 [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> [[TMP8]], <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i64> [[TMP9]], <16 x i64> [[TMP10]], <16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i64> [[TMP11]], <16 x i64> [[TMP8]], <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i64> [[TMP13]], i64 [[TMP0]], i32 9
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i64> [[TMP14]], i64 [[TMP0]], i32 10
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i64> [[TMP15]], i64 [[TMP0]], i32 11
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i64> [[TMP16]], <16 x i64> [[TMP17]], <16 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[A1]], align 16
+; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[A2]], align 8
+; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[A3]], align 16
+; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[TMP18]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = xor i64 [[TMP22]], [[TMP3]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i64 [[TMP3]], [[TMP3]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT: [[OP_RDX2:%.*]] = xor i64 [[TMP3]], [[TMP23]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = xor i64 [[TMP23]], [[TMP19]]
+; CHECK-NEXT: [[OP_RDX4:%.*]] = xor i64 [[TMP19]], [[TMP19]]
+; CHECK-NEXT: [[OP_RDX5:%.*]] = xor i64 [[TMP20]], [[TMP20]]
+; CHECK-NEXT: [[OP_RDX6:%.*]] = xor i64 [[TMP21]], [[TMP21]]
 ; CHECK-NEXT: [[OP_RDX7:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX1]]
 ; CHECK-NEXT: [[OP_RDX8:%.*]] = xor i64 [[OP_RDX2]], [[OP_RDX3]]
 ; CHECK-NEXT: [[OP_RDX9:%.*]] = xor i64 [[OP_RDX4]], [[OP_RDX5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
@@ -4,16 +4,17 @@
 define void @test(float* noalias %0, float* %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float*> poison, float* [[P:%.*]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP2]], <8 x float*> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 2
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP3]], i32 4, <8 x i1> , <8 x float> poison)
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> , <16 x float> [[SHUFFLE1]], <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[SHUFFLE1]], [[TMP6]]
-; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> poison, <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP4]] to <16 x float>*
-; CHECK-NEXT: store <16 x float> [[SHUFFLE2]], <16 x float>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float*> [[TMP2]], <8 x float*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[TMP3]], <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> , <16 x float> [[TMP8]], <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> poison, <16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP5]] to <16 x float>*
+; CHECK-NEXT: store <16 x float> [[TMP11]], <16 x float>* [[TMP12]], align 4
 ; CHECK-NEXT: ret void
 ;
 %2 = getelementptr inbounds float, float* %p, i64 2