diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -661,7 +661,8 @@
   const unsigned E = Indices.size();
   Mask.resize(E, UndefMaskElem);
   for (unsigned I = 0; I < E; ++I)
-    Mask[Indices[I]] = I;
+    if (Indices[I] != E)
+      Mask[Indices[I]] = I;
 }
 
 /// \returns inserting index of InsertElement or InsertValue instruction,
@@ -1925,8 +1926,8 @@
 
     /// Do we need to gather this sequence or vectorize it
     /// (either with vector instruction or with scatter/gather
-    /// intrinsics for store/load)?
-    enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
+    /// intrinsics for store/load or split entry block)?
+    enum EntryState { Vectorize, ScatterVectorize, NeedToGather, SplitShuffle };
     EntryState State;
 
     /// Does this sequence require some shuffling?
@@ -2099,6 +2100,9 @@
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
+      case SplitShuffle:
+        dbgs() << "SplitShuffle\n";
+        break;
       }
       dbgs() << "MainOp: ";
       if (MainOp)
@@ -2167,7 +2171,8 @@
                           const EdgeInfo &UserTreeIdx,
                           ArrayRef<int> ReuseShuffleIndices = None,
                           ArrayRef<unsigned> ReorderIndices = None) {
-    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
+    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
+                         EntryState == TreeEntry::SplitShuffle)) ||
             (Bundle && EntryState != TreeEntry::NeedToGather)) &&
            "Need to vectorize gather entry?");
     VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
@@ -2192,7 +2197,8 @@
       Last->setOperations(S);
       Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
     }
-    if (Last->State != TreeEntry::NeedToGather) {
+    if (Last->State == TreeEntry::Vectorize ||
+        Last->State == TreeEntry::ScatterVectorize) {
       for (Value *V : VL) {
         assert(!getTreeEntry(V) && "Scalar already in tree!");
         ScalarToTreeEntry[V] = Last;
@@ -2207,7 +2213,7 @@
       }
       assert((!Bundle.getValue() || Lane == VL.size()) &&
              "Bundle and VL out of sync");
-    } else {
+    } else if (Last->State == TreeEntry::NeedToGather) {
       MustGather.insert(VL.begin(), VL.end());
     }
 
@@ -3134,19 +3140,48 @@
                         [VF, &TE](const EdgeInfo &EI) {
                           return EI.UserTE->Scalars.size() == VF ||
                                  EI.UserTE->Scalars.size() ==
-                                     TE->Scalars.size();
+                                     TE->Scalars.size() ||
+                                 (EI.UserTE->State == TreeEntry::SplitShuffle &&
+                                  EI.UserTE->Scalars.size() == 2 * VF);
                         }) &&
                  "All users must be of VF size.");
           // Update ordering of the operands with the smaller VF than the given
           // one.
           reorderReuses(TE->ReuseShuffleIndices, Mask);
+          continue;
+        }
+        if (TE->State == TreeEntry::SplitShuffle &&
+            TE->Scalars.size() == VF * 2) {
+          // Build a full mask out of smaller mask by just duplicating it.
+          assert(!TE->ReorderIndices.empty() &&
+                 "Expected reordered indeces in split shuffle node.");
+          TE->reorderOperands(Mask);
+          unsigned Sz = 2 * VF;
+          SmallVector<int> SplitMask(Sz);
+          copy(Mask, SplitMask.begin());
+          transform(Mask, std::next(SplitMask.begin(), VF),
+                    [VF](int Idx) { return Idx + VF; });
+          reorderScalars(TE->Scalars, SplitMask);
+          SmallVector<unsigned> PrevOrder(Sz, Sz);
+          TE->ReorderIndices.swap(PrevOrder);
+          for (unsigned I = 0; I < Sz; ++I) {
+            if (SplitMask[I] == UndefMaskElem)
+              continue;
+            TE->ReorderIndices[SplitMask[I]] = PrevOrder[I];
+          }
         }
         continue;
       }
-      if (TE->State == TreeEntry::Vectorize &&
-          isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
-              InsertElementInst>(TE->getMainOp()) &&
-          !TE->isAltShuffle()) {
+      if (TE->State == TreeEntry::SplitShuffle) {
+        reorderOrder(TE->ReorderIndices, Mask);
+        if (TE->ReorderIndices.empty()) {
+          TE->ReorderIndices.assign(Mask.size(), 0);
+          std::iota(TE->ReorderIndices.begin(), TE->ReorderIndices.end(), 0);
+        }
+      } else if (TE->State == TreeEntry::Vectorize &&
+                 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
+                     InsertElementInst>(TE->getMainOp()) &&
+                 !TE->isAltShuffle()) {
         // Build correct orders for extract{element,value}, loads and
         // stores.
         reorderOrder(TE->ReorderIndices, Mask);
@@ -3366,25 +3401,37 @@
       }
       // For gathers just need to reorder its scalars.
       for (TreeEntry *Gather : GatherOps) {
-        assert(Gather->ReorderIndices.empty() &&
+        assert((Gather->State == TreeEntry::SplitShuffle ||
+                Gather->ReorderIndices.empty()) &&
                "Unexpected reordering of gathers.");
         if (!Gather->ReuseShuffleIndices.empty()) {
           // Just reorder reuses indices.
           reorderReuses(Gather->ReuseShuffleIndices, Mask);
           continue;
         }
-        reorderScalars(Gather->Scalars, Mask);
+        if (Gather->State == TreeEntry::SplitShuffle) {
+          reorderOrder(Gather->ReorderIndices, Mask);
+          if (Gather->ReorderIndices.empty()) {
+            Gather->ReorderIndices.assign(Mask.size(), 0);
+            std::iota(Gather->ReorderIndices.begin(),
+                      Gather->ReorderIndices.end(), 0);
+          }
+        } else {
+          reorderScalars(Gather->Scalars, Mask);
+        }
         OrderedEntries.remove(Gather);
       }
       // Reorder operands of the user node and set the ordering for the user
       // node itself.
-      if (Data.first->State != TreeEntry::Vectorize ||
-          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
-              Data.first->getMainOp()) ||
-          Data.first->isAltShuffle())
+      if (Data.first->State != TreeEntry::SplitShuffle &&
+          (Data.first->State != TreeEntry::Vectorize ||
+           !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
+               Data.first->getMainOp()) ||
+           Data.first->isAltShuffle()))
         Data.first->reorderOperands(Mask);
-      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
-          Data.first->isAltShuffle()) {
+      if (Data.first->State != TreeEntry::SplitShuffle &&
+          (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
+           Data.first->isAltShuffle())) {
         reorderScalars(Data.first->Scalars, Mask);
         reorderOrder(Data.first->ReorderIndices, MaskOrder);
         if (Data.first->ReuseShuffleIndices.empty() &&
@@ -3394,6 +3441,27 @@
           // the graph.
           OrderedEntries.insert(Data.first);
         }
+      } else if (Data.first->State == TreeEntry::SplitShuffle) {
+        // Build a full mask out of smaller mask by just duplicating it.
+        unsigned VF = Mask.size();
+        unsigned Sz = 2 * VF;
+        assert(Data.first->Scalars.size() == Sz &&
+               "Scalars size must be twice of operands size.");
+        assert(!Data.first->ReorderIndices.empty() &&
+               "Expected reordered indeces in spit shuffle node.");
+        Data.first->reorderOperands(Mask);
+        SmallVector<int> SplitMask(Sz);
+        copy(Mask, SplitMask.begin());
+        transform(Mask, std::next(SplitMask.begin(), VF),
+                  [VF](int Idx) { return Idx + VF; });
+        reorderScalars(Data.first->Scalars, SplitMask);
+        SmallVector<unsigned> PrevOrder(Sz, Sz);
+        Data.first->ReorderIndices.swap(PrevOrder);
+        for (unsigned I = 0; I < Sz; ++I) {
+          if (SplitMask[I] == UndefMaskElem)
+            continue;
+          Data.first->ReorderIndices[SplitMask[I]] = PrevOrder[I];
+        }
       } else {
         reorderOrder(Data.first->ReorderIndices, Mask);
       }
@@ -3412,7 +3480,8 @@
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
-    if (Entry->State == TreeEntry::NeedToGather)
+    if (Entry->State == TreeEntry::NeedToGather ||
+        Entry->State == TreeEntry::SplitShuffle)
       continue;
 
     // For each lane:
@@ -3539,6 +3608,78 @@
   return LoadsState::Gather;
 }
 
+/// Generates key/subkey pair for the given value to provide effective sorting
+/// of the values and better detection of the vectorizable values sequences. The
+/// keys/subkeys can be used for better sorting of the values themselves (keys)
+/// and in values subgroups (subkeys).
+static std::pair<size_t, size_t> generateKeySubkey(
+    Value *V, const TargetLibraryInfo *TLI,
+    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator) {
+  hash_code Key = hash_value(V->getValueID() + 1);
+  hash_code SubKey = hash_value(0);
+  // Sort the loads by the distance between the pointers.
+  if (auto *LI = dyn_cast<LoadInst>(V)) {
+    Key = hash_combine(hash_value(LI->getParent()), Key);
+    if (LI->isSimple())
+      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
+    else
+      SubKey = hash_value(LI);
+  } else if (isVectorLikeInstWithConstOps(V)) {
+    // Sort extracts by the vector operands.
+    if (isa<ExtractElementInst, UndefValue>(V))
+      Key = hash_value(Value::UndefValueVal + 1);
+    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
+      if (!isUndefVector(EI->getVectorOperand()) &&
+          !isa<UndefValue>(EI->getIndexOperand()))
+        SubKey = hash_value(EI->getVectorOperand());
+    }
+  } else if (auto *I = dyn_cast<Instruction>(V)) {
+    // Sort other instructions just by the opcodes except for CMPInst.
+    // For CMP also sort by the predicate kind.
+    if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
+        isValidForAlternation(I->getOpcode())) {
+      Key = hash_value(0);
+      SubKey = hash_combine(
+          hash_value(isa<BinaryOperator>(I) ? 1 : 0),
+          hash_value(isa<BinaryOperator>(I)
+                         ? I->getType()
+                         : cast<CastInst>(I)->getOperand(0)->getType()));
+    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
+      CmpInst::Predicate Pred = CI->getPredicate();
+      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
+      SubKey = hash_combine(hash_value(I->getOpcode()),
+                            hash_value(Pred > SwapPred ? Pred : SwapPred),
+                            hash_value(Pred > SwapPred ? SwapPred : Pred),
+                            hash_value(CI->getOperand(0)->getType()));
+    } else if (auto *Call = dyn_cast<CallInst>(I)) {
+      Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
+      if (isTriviallyVectorizable(ID))
+        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
+      else if (!VFDatabase(*Call).getMappings(*Call).empty())
+        SubKey = hash_combine(hash_value(I->getOpcode()),
+                              hash_value(Call->getCalledFunction()));
+      else
+        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
+      for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
+        SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
+                              hash_value(Op.Tag), SubKey);
+    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
+      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
+        SubKey = hash_value(Gep->getPointerOperand());
+      else
+        SubKey = hash_value(Gep);
+    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
+               !isa<ConstantInt>(I->getOperand(1))) {
+      // Do not try to vectorize instructions with potentially high cost.
+      SubKey = hash_value(I);
+    } else {
+      SubKey = hash_value(I->getOpcode());
+    }
+    Key = hash_combine(hash_value(I->getParent()), Key);
+  }
+  return std::make_pair(Key, SubKey);
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             const EdgeInfo &UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -3621,9 +3762,148 @@
   // If all of the operands are identical or constant we have a simple solution.
   // If we deal with insert/extract instructions, they all must have constant
   // indices, otherwise we should gather them, not try to vectorize.
-  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() ||
+  bool IsSplat = isSplat(VL);
+  bool AllConstant = !IsSplat && allConstant(VL);
+  bool AllSameBlock = !AllConstant && allSameBlock(VL);
+  if (IsSplat || AllConstant || !AllSameBlock || !S.getOpcode() ||
       (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
        !all_of(VL, isVectorLikeInstWithConstOps))) {
+    // Try to build split shuffle if possible.
+    SmallPtrSet<Value *, 4> UniqueVals;
+    unsigned VF = VL.size() / 2;
+    if (!IsSplat && !AllConstant && (!AllSameBlock || !S.getOpcode()) &&
+        VL.size() > 2 && count_if(VL, [&UniqueVals](Value *V) {
+                           return UniqueVals.insert(V).second;
+                         }) > 2) {
+      MapVector<size_t, MapVector<size_t, SmallVector<unsigned>>>
+          SelectedValues;
+      SmallVector<unsigned> Vectorized;
+      unsigned InstCnt = 0;
+      std::pair<size_t, size_t> BestKeySubkey;
+      SmallDenseMap<unsigned, std::pair<unsigned, int>, 4> BestLoad;
+      // Group values by the most optimal kinds/subkinds.
+      for (unsigned Idx = 0, E = VL.size(); Idx < E; ++Idx) {
+        Value *V = VL[Idx];
+        if (ScalarToTreeEntry.count(V) || MustGather.contains(V)) {
+          Vectorized.push_back(Idx);
+          continue;
+        }
+        std::pair<size_t, size_t> KeySubkey = generateKeySubkey(
+            V, TLI,
+            [&SelectedValues, &BestLoad, VL, DL = DL, SE = SE,
+             VF, Idx](size_t Key, LoadInst *LI) {
+              for (const auto &LoadData : SelectedValues[Key]) {
+                auto &Data = BestLoad[LoadData.second.front()];
+                auto *RLI = cast<LoadInst>(VL[Data.first]);
+                Optional<int> Dist = getPointersDiff(
+                    RLI->getType(), RLI->getPointerOperand(), LI->getType(),
+                    LI->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true);
+                if (Dist && static_cast<unsigned>(std::abs(*Dist)) < VF) {
+                  if (Data.second > *Dist) {
+                    Data.first = Idx;
+                    Data.second = *Dist;
+                  }
+                  return hash_value(cast<LoadInst>(VL[LoadData.second.front()])
+                                        ->getPointerOperand());
+                }
+              }
+              BestLoad.try_emplace(Idx, std::make_pair(Idx, 0));
+              return hash_value(LI->getPointerOperand());
+            });
+        SmallVector<unsigned> &Items =
+            SelectedValues[KeySubkey.first][KeySubkey.second];
+        Items.push_back(Idx);
+        if (Items.size() > InstCnt && any_of(Items, [VL](unsigned Idx) {
+              if (auto *EI = dyn_cast<ExtractElementInst>(VL[Idx])) {
+                if (auto *FTy =
+                        dyn_cast<FixedVectorType>(EI->getVectorOperandType()))
+                  return FTy->getNumElements() < VL.size();
+                return false;
+              }
+              return isa<Instruction>(VL[Idx]);
+            })) {
+          InstCnt = Items.size();
+          BestKeySubkey = KeySubkey;
+        }
+      }
+      // Check number of unique elements.
+      UniqueVals.clear();
+      unsigned UniqueValsCnt = count_if(
+          SelectedValues[BestKeySubkey.first][BestKeySubkey.second],
+          [&UniqueVals, VL](unsigned Idx) {
+            return isa<UndefValue>(VL[Idx]) ||
+                   UniqueVals.insert(VL[Idx]).second;
+          });
+      // Consider it only if we can split it evenly.
+      if ((((UniqueVals.empty() || !isa<PHINode>(*UniqueVals.begin())) &&
+            InstCnt >= VF && InstCnt < VL.size()) ||
+           InstCnt == VF) &&
+          (UniqueValsCnt != 2 || (InstCnt == VF && UniqueValsCnt == VF))) {
+        LLVM_DEBUG(dbgs() << "SLP: build split shuffle block. \n");
+        // How many time shall we repeat same value in the Main node.
+        unsigned NumRepeats = VL.size() / UniqueValsCnt;
+        DenseMap<Value *, unsigned> UniqueValsCounter;
+        auto SelectedValuesVector = SelectedValues.takeVector();
+        SmallVector<unsigned> ReorderIndices(VL.size(), VL.size());
+        SmallVector<ValueList, 2> Operands(2);
+        unsigned MainCnt = 0;
+        unsigned SecondCnt = 0;
+        for (unsigned I = 0, E = SelectedValuesVector.size(); I < E; ++I) {
+          auto ValuesVector = SelectedValuesVector[I].second.takeVector();
+          for (unsigned K = 0, N = ValuesVector.size(); K < N; ++K) {
+            unsigned Sz = ValuesVector[K].second.size();
+            SmallVector<unsigned> Undefs;
+            for (unsigned Idx : ValuesVector[K].second) {
+              Value *V = VL[Idx];
+              unsigned &RepeatCntRef =
+                  UniqueValsCounter.try_emplace(V, 0).first->getSecond();
+              if (Sz >= VF && MainCnt < VF &&
+                  (isa<UndefValue>(V) || RepeatCntRef < NumRepeats)) {
+                ++RepeatCntRef;
+                if (isa<UndefValue>(V)) {
+                  Undefs.push_back(Idx);
+                } else {
+                  ReorderIndices[MainCnt] = Idx;
+                  Operands.front().push_back(V);
+                  ++MainCnt;
+                }
+              } else {
+                ReorderIndices[SecondCnt + VF] = Idx;
+                ++SecondCnt;
+                Operands.back().push_back(V);
+              }
+            }
+            // Process remaining undefs.
+            if (MainCnt < VF) {
+              for (unsigned Idx : Undefs) {
+                Value *V = VL[Idx];
+                if (Sz >= VF && MainCnt < VF) {
+                  Operands.front().push_back(V);
+                  ++MainCnt;
+                } else {
+                  ++SecondCnt;
+                  Operands.back().push_back(V);
+                }
+              }
+            }
+          }
+        }
+        for (unsigned Idx : Vectorized) {
+          Value *V = VL[Idx];
+          ReorderIndices[SecondCnt + VF] = Idx;
+          ++SecondCnt;
+          Operands.back().push_back(V);
+        }
+        TreeEntry *TE = newTreeEntry(VL, TreeEntry::SplitShuffle, None, S,
+                                     UserTreeIdx, None, ReorderIndices);
+
+        for (unsigned I = 0, E = Operands.size(); I < E; ++I)
+          TE->setOperand(I, Operands[I]);
+        for (unsigned I = 0, E = Operands.size(); I < E; ++I)
+          buildTree_rec(Operands[I], Depth + 1, {TE, I});
+        return;
+      }
+    }
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
     if (TryToFindDuplicates(S))
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
@@ -4862,6 +5142,24 @@
     }
     return ReuseShuffleCost + getGatherCost(VL);
   }
+  if (E->State == TreeEntry::SplitShuffle) {
+    unsigned VF = VL.size();
+    SmallVector<int> Mask(VF, UndefMaskElem);
+    for (unsigned I = 0; I < VF; ++I) {
+      Value *V = VL[I];
+      unsigned OpIdx = 0;
+      const auto *It = find(E->getOperand(OpIdx), V);
+      if (It == E->getOperand(OpIdx).end()) {
+        OpIdx = 1;
+        It = find(E->getOperand(OpIdx), V);
+        assert(It != E->getOperand(OpIdx).end() &&
+               "Subvectors are not synced.");
+      }
+      int Idx = std::distance(E->getOperand(OpIdx).begin(), It);
+      Mask[I] = Idx + VF * OpIdx;
+    }
+    return TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, Mask);
+  }
   InstructionCost CommonCost = 0;
   SmallVector<int> Mask;
   if (!E->ReorderIndices.empty()) {
@@ -6246,6 +6544,12 @@
         return V;
       }
   }
+  auto *I =
+      find_if(VectorizableTree, [VL](const std::unique_ptr<TreeEntry> &TE) {
+        return TE->State == TreeEntry::SplitShuffle && TE->isSame(VL);
+      });
+  if (I != VectorizableTree.end())
+    return vectorizeTree(I->get());
 
   // Check that every instruction appears once in this bundle.
   SmallVector<int> ReuseShuffleIndicies;
@@ -6309,6 +6613,42 @@
 
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
   unsigned VF = E->getVectorFactor();
+  auto &&SetInsertPointAfterOps = [this](ArrayRef<Value *> VL) {
+    // The last instruction in the bundle in program order.
+    Instruction *LastInst = nullptr;
+
+    for (Value *V : VL) {
+      // If the value was vectorized, need to get the vector value for correct
+      // insert point.
+      if (const TreeEntry *TE = getTreeEntry(V))
+        if (TE->VectorizedValue)
+          V = TE->VectorizedValue;
+      auto *I = dyn_cast<Instruction>(V);
+      if (!I)
+        continue;
+      if (!DT->isReachableFromEntry(I->getParent()))
+        continue;
+      if (!LastInst) {
+        LastInst = I;
+        continue;
+      }
+      if ((LastInst->getParent() != I->getParent() &&
+           DT->dominates(LastInst->getParent(), I->getParent())) ||
+          (LastInst->getParent() == I->getParent() && LastInst->comesBefore(I)))
+        LastInst = I;
+    }
+    // Set the insertion point after the last instruction in the bundle. Set
+    // the debug location to Front.
+    if (!LastInst)
+      return;
+    if (isa<PHINode>(LastInst))
+      Builder.SetInsertPoint(LastInst->getParent(),
+                             LastInst->getParent()->getFirstInsertionPt());
+    else
+      Builder.SetInsertPoint(LastInst->getParent(),
+                             std::next(LastInst->getIterator()));
+    Builder.SetCurrentDebugLocation(LastInst->getDebugLoc());
+  };
   ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq,
                                            CSEBlocks);
   if (E->State == TreeEntry::NeedToGather) {
@@ -6338,6 +6678,24 @@
     E->VectorizedValue = Vec;
     return Vec;
   }
+  if (E->State == TreeEntry::SplitShuffle) {
+    SetInsertPointAfterOps(E->getOperand(0));
+    Value *Op0 = vectorizeTree(E->getOperand(0));
+    SetInsertPointAfterOps(E->getOperand(1));
+    Value *Op1 = vectorizeTree(E->getOperand(1));
+    // Fix the insert point to emit shuffles exactly after the last instruction
+    // in the operands.
+    SetInsertPointAfterOps({Op0, Op1});
+    SmallVector<int> Mask;
+    inversePermutation(E->ReorderIndices, Mask);
+    Value *Vec = Builder.CreateShuffleVector(Op0, Op1, Mask);
+    if (auto *I = dyn_cast<Instruction>(Vec)) {
+      GatherShuffleSeq.insert(I);
+      CSEBlocks.insert(I->getParent());
+    }
+    E->VectorizedValue = Vec;
+    return Vec;
+  }
 
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize) &&
@@ -7020,7 +7378,8 @@
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
-    if (Entry->State == TreeEntry::NeedToGather)
+    if (Entry->State == TreeEntry::NeedToGather ||
+        Entry->State == TreeEntry::SplitShuffle)
       continue;
 
     assert(Entry->VectorizedValue && "Can't find vectorizable value");
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
@@ -10,19 +10,18 @@
 ; CHECK-NEXT:    [[ADD277:%.*]] = add nsw i32 undef, undef
 ; CHECK-NEXT:    store i32 [[ADD277]], i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[ADD277]], i32 1
 ; CHECK-NEXT:    [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
 ; CHECK-NEXT:    [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2) to <2 x i32>*), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD277]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> poison, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2) to <2 x i32>*), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP2]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> poison, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = ashr <4 x i32> [[TMP5]], <i32 6, i32 6, i32 6, i32 6>
 ; CHECK-NEXT:    [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4
 ; CHECK-NEXT:    unreachable
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -165,8 +165,8 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; CHECK-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    ret <8 x float> [[R72]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -209,12 +209,9 @@
 ; CHECK-NEXT:    [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float>
 ; CHECK-NEXT:    [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:    ret <8 x float> [[R72]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP13]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -165,8 +165,8 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; CHECK-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    ret <8 x float> [[R72]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -209,12 +209,9 @@
 ; CHECK-NEXT:    [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float>
 ; CHECK-NEXT:    [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:    ret <8 x float> [[R72]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP13]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
@@ -4,25 +4,13 @@
 define i32 @crash_reordering_undefs() {
 ; CHECK-LABEL: @crash_reordering_undefs(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR0:%.*]] = or i64 undef, undef
-; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]]
-; CHECK-NEXT:    [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD1:%.*]] = add i32 undef, [[ADD0]]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT:    [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD3:%.*]] = add i32 [[ADD1]], [[ADD2]]
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT:    [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD5:%.*]] = add i32 [[ADD3]], [[ADD4]]
-; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[ADD5]], undef
-; CHECK-NEXT:    [[ADD7:%.*]] = add i32 [[ADD6]], undef
-; CHECK-NEXT:    [[ADD8:%.*]] = add i32 [[ADD7]], undef
-; CHECK-NEXT:    [[OR1:%.*]] = or i64 undef, undef
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]]
-; CHECK-NEXT:    [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD10:%.*]] = add i32 [[ADD8]], [[ADD9]]
-; CHECK-NEXT:    [[ADD11:%.*]] = add i32 [[ADD10]], undef
-; CHECK-NEXT:    ret i32 [[ADD11]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> poison)
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP0]], undef
+; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], undef
+; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], undef
+; CHECK-NEXT:    [[OP_EXTRA3:%.*]] = add i32 [[OP_EXTRA2]], undef
+; CHECK-NEXT:    [[OP_EXTRA4:%.*]] = add i32 [[OP_EXTRA3]], undef
+; CHECK-NEXT:    ret i32 [[OP_EXTRA4]]
 ;
 entry:
   %or0 = or i64 undef, undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
@@ -8,17 +8,16 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>, <4 x i32> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; CHECK-NEXT:    br label [[FOR_BODY92:%.*]]
 ; CHECK:       for.body92:
 ; CHECK-NEXT:    [[SUM_MVR_I:%.*]] = getelementptr i32, i32* undef, i32 0
 ; CHECK-NEXT:    [[SUM_MVR_ABS_I:%.*]] = getelementptr i32, i32* undef, i32 2
 ; CHECK-NEXT:    [[SUM_MVC_I:%.*]] = getelementptr i32, i32* undef, i32 1
 ; CHECK-NEXT:    [[SUM_MVC_ABS_I:%.*]] = getelementptr i32, i32* undef, i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[SUM_MVR_I]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[SUM_MVR_I]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 8
 ; CHECK-NEXT:    br label [[FOR_BODY92]]
 ;
 for.body92.preheader:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll
@@ -13,21 +13,21 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* undef, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> poison, [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], poison
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x float> [[SHUFFLE]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> poison, [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x float> [[TMP8]], poison
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0
-; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <2 x float> undef, float [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1
-; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <2 x float> [[VEC1]], float [[TMP11]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP9]], i32 2
-; CHECK-NEXT:    [[VEC3:%.*]] = insertelement <2 x float> undef, float [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP9]], i32 3
-; CHECK-NEXT:    [[VEC4:%.*]] = insertelement <2 x float> [[VEC3]], float [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[TMP9]], poison
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 0
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <2 x float> undef, float [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 1
+; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <2 x float> [[VEC1]], float [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 2
+; CHECK-NEXT:    [[VEC3:%.*]] = insertelement <2 x float> undef, float [[TMP13]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 3
+; CHECK-NEXT:    [[VEC4:%.*]] = insertelement <2 x float> [[VEC3]], float [[TMP14]], i32 1
 ; CHECK-NEXT:    [[INS1:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[VEC2]], 0
 ; CHECK-NEXT:    [[INS2:%.*]] = insertvalue { <2 x float>, <2 x float> } [[INS1]], <2 x float> [[VEC4]], 1
 ; CHECK-NEXT:    ret { <2 x float>, <2 x float> } [[INS2]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
@@ -195,12 +195,11 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[ARG1]], i32 5
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[ARG1]], i32 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[ARG1]], i32 6
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, float [[ARG2:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG2:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> [[TMP0]], <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x float> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb16:
 ; CHECK-NEXT:    br label [[BB17:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
@@ -12,22 +12,23 @@
 ; CHECK-NEXT:    [[GEP2_1:%.*]] = getelementptr i32, i32* [[ARR2]], i32 1
 ; CHECK-NEXT:    [[GEP2_2:%.*]] = getelementptr i32, i32* [[ARR2]], i32 2
 ; CHECK-NEXT:    [[GEP2_3:%.*]] = getelementptr i32, i32* [[ARR2]], i32 3
-; CHECK-NEXT:    [[V0:%.*]] = load i32, i32* [[GEP1_0]]
-; CHECK-NEXT:    [[V1:%.*]] = load i32, i32* [[GEP1_1]]
-; CHECK-NEXT:    [[V2:%.*]] = load i32, i32* [[GEP1_2]]
-; CHECK-NEXT:    [[V3:%.*]] = load i32, i32* [[GEP1_3]]
-; CHECK-NEXT:    [[Y0:%.*]] = add nsw i32 [[A0:%.*]], 1146
-; CHECK-NEXT:    [[Y1:%.*]] = add nsw i32 [[A1:%.*]], 146
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[GEP1_0]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i32, i32* [[GEP1_2]], align 4
+; CHECK-NEXT:    [[V3:%.*]] = load i32, i32* [[GEP1_3]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 1146, i32 146>
 ; CHECK-NEXT:    [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
 ; CHECK-NEXT:    [[Y3:%.*]] = add nsw i32 [[A3:%.*]], 0
-; CHECK-NEXT:    [[RES0:%.*]] = add nsw i32 [[V0]], [[Y0]]
-; CHECK-NEXT:    [[RES1:%.*]] = add nsw i32 [[V1]], [[Y1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[RES2:%.*]] = sdiv i32 [[V2]], [[Y2]]
 ; CHECK-NEXT:    [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]]
-; CHECK-NEXT:    store i32 [[RES0]], i32* [[GEP2_0]]
-; CHECK-NEXT:    store i32 [[RES1]], i32* [[GEP2_1]]
-; CHECK-NEXT:    store i32 [[RES2]], i32* [[GEP2_2]]
-; CHECK-NEXT:    store i32 [[RES3]], i32* [[GEP2_3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[RES3]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[RES2]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[GEP2_0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -76,10 +77,10 @@
 ; CHECK-NEXT:    [[GEP2_1:%.*]] = getelementptr i32, i32* [[ARR2]], i32 1
 ; CHECK-NEXT:    [[GEP2_2:%.*]] = getelementptr i32, i32* [[ARR2]], i32 2
 ; CHECK-NEXT:    [[GEP2_3:%.*]] = getelementptr i32, i32* [[ARR2]], i32 3
-; CHECK-NEXT:    [[V0:%.*]] = load i32, i32* [[GEP1_0]]
-; CHECK-NEXT:    [[V1:%.*]] = load i32, i32* [[GEP1_1]]
-; CHECK-NEXT:    [[V2:%.*]] = load i32, i32* [[GEP1_2]]
-; CHECK-NEXT:    [[V3:%.*]] = load i32, i32* [[GEP1_3]]
+; CHECK-NEXT:    [[V0:%.*]] = load i32, i32* [[GEP1_0]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i32, i32* [[GEP1_1]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i32, i32* [[GEP1_2]], align 4
+; CHECK-NEXT:    [[V3:%.*]] = load i32, i32* [[GEP1_3]], align 4
 ; CHECK-NEXT:    [[Y0:%.*]] = add nsw i32 [[A0:%.*]], 1146
 ; CHECK-NEXT:    [[Y1:%.*]] = add nsw i32 [[A1:%.*]], 146
 ; CHECK-NEXT:    [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
@@ -88,10 +89,10 @@
 ; CHECK-NEXT:    [[RES1:%.*]] = urem i32 [[V1]], [[Y1]]
 ; CHECK-NEXT:    [[RES2:%.*]] = urem i32 [[V2]], [[Y2]]
 ; CHECK-NEXT:    [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]]
-; CHECK-NEXT:    store i32 [[RES0]], i32* [[GEP2_0]]
-; CHECK-NEXT:    store i32 [[RES1]], i32* [[GEP2_1]]
-; CHECK-NEXT:    store i32 [[RES2]], i32* [[GEP2_2]]
-; CHECK-NEXT:    store i32 [[RES3]], i32* [[GEP2_3]]
+; CHECK-NEXT:    store i32 [[RES0]], i32* [[GEP2_0]], align 4
+; CHECK-NEXT:    store i32 [[RES1]], i32* [[GEP2_1]], align 4
+; CHECK-NEXT:    store i32 [[RES2]], i32* [[GEP2_2]], align 4
+; CHECK-NEXT:    store i32 [[RES3]], i32* [[GEP2_3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -151,37 +151,36 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00
 ; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i64 [[INDVARS_IV]], 2
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP10]], i32 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>*
-; CHECK-NEXT:    [[TMP12]] = load <2 x float>, <2 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul <4 x float> [[TMP17]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
-; CHECK-NEXT:    [[TMP19]] = fadd <4 x float> [[TMP6]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>*
+; CHECK-NEXT:    [[TMP15]] = load <2 x float>, <2 x float>* [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP15]], <2 x float> [[TMP13]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
+; CHECK-NEXT:    [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP19]], i32 0
-; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP19]], i32 1
-; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP19]], i32 2
-; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x float> [[TMP19]], i32 3
-; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP24]]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
+; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 1
+; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 2
+; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
+; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]]
 ; CHECK-NEXT:    ret float [[ADD31]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -254,65 +254,39 @@
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_3(
-; AVX512F-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; AVX512F-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
-; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i64 1
-; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i64 2
-; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; AVX512F-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4
-; AVX512F-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; AVX512F-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX512F-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512F-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512F-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512F-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0
-; AVX512F-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i64 1
-; AVX512F-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i64 2
-; AVX512F-NEXT:    [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3
-; AVX512F-NEXT:    [[TMP29:%.*]] = add <4 x i32> [[TMP28]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; AVX512F-NEXT:    store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0
+; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512F-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0
+; AVX512F-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> <i64 9, i64 6>
+; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 1
+; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32*> [[TMP9]], i32* [[TMP8]], i64 0
+; AVX512F-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> [[TMP10]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512F-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP11]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:    [[TMP13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP12]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP14:%.*]] = add <8 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX512F-NEXT:    store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_3(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512VL-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512VL-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512VL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512VL-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX512VL-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512VL-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], 2
-; AVX512VL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX512VL-NEXT:    store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512VL-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP18:%.*]] = add i32 [[TMP17]], 3
-; AVX512VL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX512VL-NEXT:    store i32 [[TMP18]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512VL-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP22:%.*]] = add i32 [[TMP21]], 4
-; AVX512VL-NEXT:    store i32 [[TMP22]], i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0
+; AVX512VL-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> <i64 9, i64 6>
+; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512VL-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 1
+; AVX512VL-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32*> [[TMP9]], i32* [[TMP8]], i64 0
+; AVX512VL-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> [[TMP10]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512VL-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP11]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512VL-NEXT:    [[TMP13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP12]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP14:%.*]] = add <8 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX512VL-NEXT:    store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
 ;
   %3 = load i32, i32* %1, align 4, !tbaa !2
@@ -457,65 +431,39 @@
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_4(
-; AVX512F-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX512F-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX512F-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX512F-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4
-; AVX512F-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
-; AVX512F-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX512F-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; AVX512F-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0
+; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0
+; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> <i64 9, i64 6>
 ; AVX512F-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; AVX512F-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0
-; AVX512F-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX512F-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX512F-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0
-; AVX512F-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i64 1
-; AVX512F-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i64 2
-; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T31]], i64 3
-; AVX512F-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>*
-; AVX512F-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>*
-; AVX512F-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 1
+; AVX512F-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32*> [[TMP6]], i32* [[T30]], i64 0
+; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> [[TMP7]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512F-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 x i32*> [[TMP8]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:    [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP11:%.*]] = add <8 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX512F-NEXT:    store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_4(
-; AVX512VL-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0
 ; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
 ; AVX512VL-NEXT:    [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512VL-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX512VL-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX512VL-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
-; AVX512VL-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX512VL-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> <i64 9, i64 6>
 ; AVX512VL-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; AVX512VL-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX512VL-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512VL-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX512VL-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX512VL-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX512VL-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX512VL-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 1
+; AVX512VL-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32*> [[TMP6]], i32* [[T30]], i64 0
+; AVX512VL-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> [[TMP7]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512VL-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 x i32*> [[TMP8]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512VL-NEXT:    [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP11:%.*]] = add <8 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX512VL-NEXT:    store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
 ;
   %t5 = getelementptr inbounds i32, i32* %t0, i64 1
@@ -730,48 +678,46 @@
 ;
 ; AVX512F-LABEL: @gather_load_div(
 ; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512F-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
+; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
 ; AVX512F-NEXT:    [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
 ; AVX512F-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
 ; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> <i64 8, i64 5>
 ; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512F-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
-; AVX512F-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512F-NEXT:    [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512F-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
-; AVX512F-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX512F-NEXT:    store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 1
+; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <2 x float*> [[TMP9]], float* [[TMP8]], i64 0
+; AVX512F-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> [[TMP10]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512F-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> [[TMP11]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:    [[TMP13:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP12]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512F-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512F-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP17:%.*]] = fdiv <8 x float> [[TMP13]], [[TMP16]]
+; AVX512F-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX512F-NEXT:    store <8 x float> [[TMP17]], <8 x float>* [[TMP18]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_div(
 ; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
 ; AVX512VL-NEXT:    [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
 ; AVX512VL-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> <i64 8, i64 5>
 ; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512VL-NEXT:    [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
-; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512VL-NEXT:    [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512VL-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
-; AVX512VL-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX512VL-NEXT:    store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP9:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 1
+; AVX512VL-NEXT:    [[TMP10:%.*]] = insertelement <2 x float*> [[TMP9]], float* [[TMP8]], i64 0
+; AVX512VL-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> [[TMP10]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512VL-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> [[TMP11]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512VL-NEXT:    [[TMP13:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP12]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP15:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512VL-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP17:%.*]] = fdiv <8 x float> [[TMP13]], [[TMP16]]
+; AVX512VL-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX512VL-NEXT:    store <8 x float> [[TMP17]], <8 x float>* [[TMP18]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
 ;
   %3 = load float, float* %1, align 4, !tbaa !2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -254,65 +254,39 @@
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_3(
-; AVX512F-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; AVX512F-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
-; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i64 1
-; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i64 2
-; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; AVX512F-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4
-; AVX512F-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; AVX512F-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX512F-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512F-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512F-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512F-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0
-; AVX512F-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i64 1
-; AVX512F-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i64 2
-; AVX512F-NEXT:    [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3
-; AVX512F-NEXT:    [[TMP29:%.*]] = add <4 x i32> [[TMP28]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; AVX512F-NEXT:    store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0
+; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512F-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0
+; AVX512F-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> <i64 9, i64 6>
+; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 1
+; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32*> [[TMP9]], i32* [[TMP8]], i64 0
+; AVX512F-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> [[TMP10]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512F-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP11]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:    [[TMP13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP12]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP14:%.*]] = add <8 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX512F-NEXT:    store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_3(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512VL-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512VL-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512VL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512VL-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX512VL-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512VL-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], 2
-; AVX512VL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX512VL-NEXT:    store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512VL-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP18:%.*]] = add i32 [[TMP17]], 3
-; AVX512VL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX512VL-NEXT:    store i32 [[TMP18]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512VL-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP22:%.*]] = add i32 [[TMP21]], 4
-; AVX512VL-NEXT:    store i32 [[TMP22]], i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0
+; AVX512VL-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> <i64 9, i64 6>
+; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512VL-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 1
+; AVX512VL-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32*> [[TMP9]], i32* [[TMP8]], i64 0
+; AVX512VL-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> [[TMP10]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512VL-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP11]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512VL-NEXT:    [[TMP13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP12]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP14:%.*]] = add <8 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX512VL-NEXT:    store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
 ;
   %3 = load i32, i32* %1, align 4, !tbaa !2
@@ -457,65 +431,39 @@
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_4(
-; AVX512F-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX512F-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX512F-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX512F-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4
-; AVX512F-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
-; AVX512F-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX512F-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; AVX512F-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0
+; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0
+; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> <i64 9, i64 6>
 ; AVX512F-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; AVX512F-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0
-; AVX512F-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX512F-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX512F-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0
-; AVX512F-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i64 1
-; AVX512F-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i64 2
-; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T31]], i64 3
-; AVX512F-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>*
-; AVX512F-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>*
-; AVX512F-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 1
+; AVX512F-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32*> [[TMP6]], i32* [[T30]], i64 0
+; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> [[TMP7]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512F-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 x i32*> [[TMP8]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:    [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP11:%.*]] = add <8 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX512F-NEXT:    store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_4(
-; AVX512VL-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0
 ; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
 ; AVX512VL-NEXT:    [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512VL-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX512VL-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX512VL-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
-; AVX512VL-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX512VL-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> <i64 9, i64 6>
 ; AVX512VL-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
-; AVX512VL-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX512VL-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512VL-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX512VL-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX512VL-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX512VL-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX512VL-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 1
+; AVX512VL-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32*> [[TMP6]], i32* [[T30]], i64 0
+; AVX512VL-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> [[TMP7]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512VL-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 x i32*> [[TMP8]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512VL-NEXT:    [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP11:%.*]] = add <8 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX512VL-NEXT:    store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
 ;
   %t5 = getelementptr inbounds i32, i32* %t0, i64 1
@@ -730,48 +678,46 @@
 ;
 ; AVX512F-LABEL: @gather_load_div(
 ; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512F-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
+; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
 ; AVX512F-NEXT:    [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
 ; AVX512F-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
 ; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> <i64 8, i64 5>
 ; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512F-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
-; AVX512F-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512F-NEXT:    [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512F-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
-; AVX512F-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX512F-NEXT:    store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 1
+; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <2 x float*> [[TMP9]], float* [[TMP8]], i64 0
+; AVX512F-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> [[TMP10]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512F-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> [[TMP11]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:    [[TMP13:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP12]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512F-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512F-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP17:%.*]] = fdiv <8 x float> [[TMP13]], [[TMP16]]
+; AVX512F-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX512F-NEXT:    store <8 x float> [[TMP17]], <8 x float>* [[TMP18]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_div(
 ; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
 ; AVX512VL-NEXT:    [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
 ; AVX512VL-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> <i64 8, i64 5>
 ; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512VL-NEXT:    [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
-; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512VL-NEXT:    [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512VL-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
-; AVX512VL-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX512VL-NEXT:    store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP9:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 1
+; AVX512VL-NEXT:    [[TMP10:%.*]] = insertelement <2 x float*> [[TMP9]], float* [[TMP8]], i64 0
+; AVX512VL-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> [[TMP10]], <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; AVX512VL-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> [[TMP11]], <8 x i32> <i32 4, i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7>
+; AVX512VL-NEXT:    [[TMP13:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP12]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP15:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512VL-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP17:%.*]] = fdiv <8 x float> [[TMP13]], [[TMP16]]
+; AVX512VL-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX512VL-NEXT:    store <8 x float> [[TMP17]], <8 x float>* [[TMP18]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_div(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
@@ -6,15 +6,13 @@
 
 define <4 x i32> @foo(<4 x i32> %x, i32 %f) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i64 0
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[F]], 1
-; CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[VECINIT51]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 1, i32 2>
+; CHECK-NEXT:    [[ADD4:%.*]] = add nsw i32 [[F]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[ADD4]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %vecinit = insertelement <4 x i32> undef, i32 %f, i32 0
   %add = add nsw i32 %f, 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -213,20 +213,35 @@
 ;       logic...or a wide reduction?
 
 define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT:    [[TMP3:%.*]] = freeze <4 x i1> [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
-; CHECK-NEXT:    [[S4:%.*]] = select i1 [[TMP4]], i1 [[TMP5]], i1 false
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; CHECK-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[TMP6]], i1 false
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; CHECK-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[TMP7]], i1 false
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; CHECK-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[TMP8]], i1 false
-; CHECK-NEXT:    ret i1 [[S7]]
+; SSE-LABEL: @logical_and_icmp_clamp(
+; SSE-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; SSE-NEXT:    [[TMP3:%.*]] = freeze <4 x i1> [[TMP1]]
+; SSE-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; SSE-NEXT:    [[S4:%.*]] = select i1 [[TMP4]], i1 [[TMP5]], i1 false
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; SSE-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[TMP6]], i1 false
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; SSE-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[TMP7]], i1 false
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; SSE-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[TMP8]], i1 false
+; SSE-NEXT:    ret i1 [[S7]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp(
+; AVX-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
+; AVX-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[X]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 17, i32 17, i32 17, i32 17>, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 42, i32 42, i32 42, i32 42>, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP3]], i32 1
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP2]], i32 2
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP1]], i32 3
+; AVX-NEXT:    [[TMP10:%.*]] = icmp slt <8 x i32> [[TMP5]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = freeze <8 x i1> [[TMP10]]
+; AVX-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]])
+; AVX-NEXT:    ret i1 [[TMP12]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
@@ -11,11 +11,12 @@
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[T3:%.*]] = load i32, i32* [[T2:%.*]], align 4
 ; CHECK-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 7
-; CHECK-NEXT:    [[T5:%.*]] = load i32, i32* [[T4]], align 4
 ; CHECK-NEXT:    [[T8:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 1
 ; CHECK-NEXT:    [[T9:%.*]] = load i32, i32* [[T8]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> <i32 poison, i32 6270>, i32 [[T9]], i32 0
 ; CHECK-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 6
-; CHECK-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[T10]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 2
 ; CHECK-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4
 ; CHECK-NEXT:    [[T16:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 5
@@ -31,32 +32,33 @@
 ; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
 ; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
-; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
-; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
-; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[T37]], [[T38]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[T25]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[T17]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
+; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    [[T40:%.*]] = mul nsw i32 [[T39]], 9633
 ; CHECK-NEXT:    [[T41:%.*]] = mul nsw i32 [[T25]], 2446
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
-; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
-; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, i32 [[T9]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
-; CHECK-NEXT:    [[T69:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1
-; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <2 x i32> [[TMP6]], <i32 -16069, i32 -3196>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[T40]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[T27]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[T40]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[T15]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], [[TMP10]]
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP10]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[TMP17]], i32 3
+; CHECK-NEXT:    [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[T691:%.*]] = shufflevector <8 x i32> [[T65]], <8 x i32> [[TMP19]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i32 0
+; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[TMP20]], i32 5
 ; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
-; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP14]], i32 7
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP17]], i32 2
+; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP21]], i32 7
 ; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
@@ -11,11 +11,12 @@
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[T3:%.*]] = load i32, i32* [[T2:%.*]], align 4
 ; CHECK-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 7
-; CHECK-NEXT:    [[T5:%.*]] = load i32, i32* [[T4]], align 4
 ; CHECK-NEXT:    [[T8:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 1
 ; CHECK-NEXT:    [[T9:%.*]] = load i32, i32* [[T8]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> <i32 poison, i32 6270>, i32 [[T9]], i32 0
 ; CHECK-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 6
-; CHECK-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[T10]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 2
 ; CHECK-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4
 ; CHECK-NEXT:    [[T16:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 5
@@ -31,32 +32,33 @@
 ; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
 ; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
-; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
-; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
-; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[T37]], [[T38]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[T25]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[T17]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
+; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    [[T40:%.*]] = mul nsw i32 [[T39]], 9633
 ; CHECK-NEXT:    [[T41:%.*]] = mul nsw i32 [[T25]], 2446
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
-; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
-; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, i32 [[T9]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
-; CHECK-NEXT:    [[T69:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1
-; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <2 x i32> [[TMP6]], <i32 -16069, i32 -3196>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[T40]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[T27]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[T40]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[T15]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], [[TMP10]]
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP10]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[TMP17]], i32 3
+; CHECK-NEXT:    [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[T691:%.*]] = shufflevector <8 x i32> [[T65]], <8 x i32> [[TMP19]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i32 0
+; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[TMP20]], i32 5
 ; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
-; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP14]], i32 7
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP17]], i32 2
+; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP21]], i32 7
 ; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -45,20 +45,20 @@
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 1, i32 2>
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
-; CHECK-NEXT:    store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[ADD9]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP6]], <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -336,20 +336,20 @@
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP1]], 1
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[SHL5:%.*]] = shl i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <2 x i32> [[TMP2]], <i32 1, i32 2>
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SHL8:%.*]] = shl i32 [[TMP3]], 3
-; CHECK-NEXT:    store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SHL8:%.*]] = shl i32 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[SHL8]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP6]], <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -455,20 +455,20 @@
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[TMP0]], float* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float 1.000000e+00, float 2.000000e+00>
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
-; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP4]], 3.000000e+00
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[ADD9]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -701,22 +701,22 @@
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], <float 2.570000e+02, float -3.000000e+00>
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT:    store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP4]], -9.000000e+00
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[SUB9]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -784,20 +784,20 @@
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[TMP0]], float* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[ADD3:%.*]] = fadd float [[TMP1]], 1.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = fadd float [[TMP2]], 2.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], <float 1.000000e+00, float 2.000000e+00>
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
-; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = fadd float [[TMP4]], 3.000000e+00
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[ADD9]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -944,22 +944,22 @@
 ; CHECK-LABEL: @mulfn(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fmul float [[TMP0]], 2.570000e+02
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SUB3:%.*]] = fmul float [[TMP1]], -3.000000e+00
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.570000e+02, float -3.000000e+00>
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT:    store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP4]], -9.000000e+00
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[SUB9]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
@@ -12,7 +12,7 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP18:%.*]], [[BB3:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP14:%.*]], [[BB3:%.*]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load double, double* undef, align 8
 ; CHECK-NEXT:    br i1 undef, label [[BB3]], label [[BB4:%.*]]
 ; CHECK:       bb4:
@@ -23,17 +23,13 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP4]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP16:%.*]] = fptrunc <4 x double> [[TMP15]] to <4 x float>
-; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP2]], <4 x float> [[TMP16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = fcmp ogt <4 x double> [[TMP10]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fptrunc <4 x double> [[TMP10]] to <4 x float>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP2]], <4 x float> [[TMP12]]
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP18]] = phi <4 x float> [ [[TMP17]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP14]] = phi <4 x float> [ [[TMP13]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
 ; CHECK-NEXT:    br label [[BB2]]
 ;
 entry: