Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -516,11 +516,14 @@
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  int getSpillCost();
+  int getSpillCost(const SmallPtrSetImpl &ScalarsToVec);
+
+  /// \returns the cost of extracting vectorized elements.
+  int getExtractCost(const SmallPtrSetImpl &ScalarsToVec);
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  int getTreeCost();
+  int getTreeCost(bool ReduceTree = false);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -541,6 +544,7 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -600,6 +604,9 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable();
 
+  /// Reduce the cost of the tree to make it partially vectorizable, if possible.
+  bool reduceTree();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
 private:
@@ -700,6 +707,12 @@
     /// The TreeEntry index containing the user of this entry. We can actually
     /// have multiple users so the data structure is not truly a tree.
     SmallVector UserTreeIndices;
+
+    /// Cost of the tree entry.
+    int Cost = 0;
+
+    /// Full cost of expanding the tree at this height.
+    int ExpandAtCost = 0;
   };
 
   /// Create a new VectorizableTree entry.
@@ -742,6 +755,9 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap ScalarToTreeEntry;
 
+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector RemovedOperations;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
@@ -1170,6 +1186,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector> BlocksSchedules;
 
+  /// Remove operations from the list of operations proposed for scheduling.
+  void removeFromScheduling(BlockScheduling *BS, ArrayRef Operations);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -2413,6 +2432,55 @@
   }
 }
 
+bool BoUpSLP::reduceTree() {
+  bool Reduced = false;
+
+  // Estimating where to stop vectorization.
+  unsigned StopAt = 0;
+  int MinCost = VectorizableTree.back().ExpandAtCost;
+  for (unsigned I = VectorizableTree.size(); I--;) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    if (VectorizableTree[I].ExpandAtCost < MinCost) {
+      MinCost = VectorizableTree[I].ExpandAtCost;
+      Reduced = true;
+      StopAt = I;
+    }
+  }
+
+  // Canceling unprofitable elements.
+  if (StopAt > 0 && StopAt < (VectorizableTree.size() - 1)) {
+    int ReducedBy = MinCost - VectorizableTree.back().ExpandAtCost;
+    LLVM_DEBUG(dbgs() << "SLP: Reduced the tree cost by " << ReducedBy
+                      << " to make it partially vectorizable.\n");
+    Reduced = true;
+    for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) {
+      TreeEntry *Entry = &VectorizableTree[I];
+      if (!Entry->NeedToGather) {
+        if (I >= StopAt) {
+          Entry->NeedToGather = true;
+          for (Value *V : Entry->Scalars) {
+            LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                              << " out of proposed to vectorize.\n");
+            RemovedOperations.push_back(I);
+            ScalarToTreeEntry.erase(V);
+            MustGather.insert(V);
+            ExternalUses.erase(std::remove_if(ExternalUses.begin(),
+                                              ExternalUses.end(),
+                                              [&](ExternalUser &EU) {
+                                                return EU.Scalar == V;
+                                              }),
+                               ExternalUses.end());
+          }
+        }
+      }
+    }
+  }
+
+  return Reduced;
+}
+
 bool BoUpSLP::isFullyVectorizableTinyTree() {
   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                     << VectorizableTree.size() << " is fully vectorizable .\n");
@@ -2457,7 +2525,7 @@
   return true;
 }
 
-int BoUpSLP::getSpillCost() {
+int BoUpSLP::getSpillCost(const SmallPtrSetImpl &ScalarsToVec) {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
@@ -2481,7 +2549,7 @@
     // Update LiveValues.
     LiveValues.erase(PrevInst);
     for (auto &J : PrevInst->operands()) {
-      if (isa(&*J) && getTreeEntry(&*J))
+      if (isa(&*J) && ScalarsToVec.count(&*J) > 0)
         LiveValues.insert(cast(&*J));
     }
 
@@ -2512,23 +2580,51 @@
       V.push_back(VectorType::get(II->getType(), BundleWidth));
       Cost += TTI->getCostOfKeepingLiveOverCall(V);
     }
-
       ++PrevInstIt;
     }
-
     PrevInst = Inst;
   }
 
   return Cost;
 }
 
-int BoUpSLP::getTreeCost() {
-  int Cost = 0;
-  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
-                    << VectorizableTree.size() << ".\n");
+int BoUpSLP::getExtractCost(const SmallPtrSetImpl &ScalarsToVec) {
+  int ExtractCost = 0;
+  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
+  SmallPtrSet ExtractCostCalculated;
+  for (ExternalUser &EU : ExternalUses) {
-    unsigned BundleWidth = VectorizableTree[0].Scalars.size();
+    // Avoid non-vectorized scalars for this tree height.
+    if (ScalarsToVec.count(EU.Scalar) == 0)
+      continue;
+    // We only add extract cost once for the same scalar.
+    if (!ExtractCostCalculated.insert(EU.Scalar).second)
+      continue;
+
+    // If we plan to rewrite the tree in a smaller type, we will need to sign
+    // extend the extracted value back to the original type. Here, we account
+    // for the extract and the added cost of the sign extend if needed.
+    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+    if (MinBWs.count(ScalarRoot)) {
+      auto *MinTy = IntegerType::get(F->getContext(),
+                                     MinBWs[ScalarRoot].first);
+      auto Extend =
+          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
+      VecTy = VectorType::get(MinTy, BundleWidth);
+      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+                                                   VecTy, EU.Lane);
+    } else {
+      ExtractCost +=
+          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+    }
+  }
+  return ExtractCost;
+}
+
+int BoUpSLP::getTreeCost(bool ReduceTree) {
   for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
     TreeEntry &TE = VectorizableTree[I];
@@ -2551,60 +2647,60 @@
         }))
       continue;
 
-    int C = getEntryCost(&TE);
-    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+    TE.Cost = getEntryCost(&TE);
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost
                       << " for bundle that starts with " << *TE.Scalars[0]
                       << ".\n");
-    Cost += C;
   }
 
-  SmallPtrSet ExtractCostCalculated;
+  SmallPtrSet ScalarsToVec;
+  int CostSum = 0;
   int ExtractCost = 0;
-  for (ExternalUser &EU : ExternalUses) {
-    // We only add extract cost once for the same scalar.
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
+  int SpillCost = 0;
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) {
+    TreeEntry *Entry = &VectorizableTree[I];
 
-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
-      continue;
+    CostSum += Entry->Cost;
 
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
-    if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-      auto Extend =
-          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = VectorType::get(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+    Entry->ExpandAtCost = 0;
+    // If we have not encountered any gather node before, add gather
+    // cost for that operation.
+    if (!Entry->NeedToGather) {
+      if (I != (VectorizableTree.size() - 1)) {
+        if (!VectorizableTree[I + 1].NeedToGather)
+          Entry->ExpandAtCost = getGatherCost(Entry->Scalars);
+      }
+      for (Value *V : Entry->Scalars)
+        ScalarsToVec.insert(V);
     }
-  }
+    Entry->ExpandAtCost += CostSum;
+
+    ExtractCost = getExtractCost(ScalarsToVec);
+    Entry->ExpandAtCost += ExtractCost;
 
-  int SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
+    SpillCost = getSpillCost(ScalarsToVec);
+    Entry->ExpandAtCost += SpillCost;
+  }
+
+  int MinCost = VectorizableTree.back().ExpandAtCost;
+  if (ReduceTree) {
+    for (unsigned I = VectorizableTree.size(); I--;) {
+      TreeEntry *Entry = &VectorizableTree[I];
+      if (Entry->NeedToGather)
+        continue;
+      if (VectorizableTree[I].ExpandAtCost < MinCost)
+        MinCost = VectorizableTree[I].ExpandAtCost;
+    }
+  }
 
   std::string Str;
   {
     raw_string_ostream OS(Str);
-    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
-       << "SLP: Extract Cost = " << ExtractCost << ".\n"
-       << "SLP: Total Cost = " << Cost << ".\n";
+    OS << "SLP: Total Cost = " << MinCost << ".\n";
   }
   LLVM_DEBUG(dbgs() << Str);
-
   if (ViewSLPTree)
     ViewGraph(this, "SLP" + F->getName(), false, Str);
-
-  return Cost;
+  return MinCost;
 }
 
 int BoUpSLP::getGatherCost(Type *Ty,
@@ -3574,7 +3670,12 @@
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove all ScheduleData from the nodes for which we have changed the
+    // vectorization decision.
+    if (!RemovedOperations.empty())
+      removeFromScheduling(BS, RemovedOperations);
+    scheduleBlock(BS);
   }
 
   Builder.SetInsertPoint(&F->getEntryBlock().front());
@@ -3702,15 +3803,8 @@
       Type *Ty = Scalar->getType();
       if (!Ty->isVoidTy()) {
-#ifndef NDEBUG
-        for (User *U : Scalar->users()) {
-          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
-
-          // It is legal to replace users in the ignorelist by undef.
-          assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
-                 "Replacing out-of-tree value with undef");
-        }
-#endif
+        // The tree might not be fully vectorized, so we don't have to
+        // check every user.
         Value *Undef = UndefValue::get(Ty);
         Scalar->replaceAllUsesWith(Undef);
       }
@@ -4186,6 +4280,33 @@
   ReadyInsts.clear();
 }
 
+void BoUpSLP::removeFromScheduling(BlockScheduling *BS, ArrayRef Operations) {
+  bool Removed = false;
+  for (int I : Operations) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]);
+    if (SD && SD->isPartOfBundle()) {
+      if (!Removed) {
+        Removed = true;
+        BS->resetSchedule();
+      }
+      BS->cancelScheduling(Entry->Scalars, SD->OpValue);
+    }
+  }
+  if (Removed) {
+    BS->resetSchedule();
+    BS->initialFillReadyList(BS->ReadyInsts);
+    for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+         I = I->getNextNode()) {
+      if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end())
+        continue;
+      BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
+        SD->clearDependencies();
+      });
+    }
+  }
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -4682,7 +4803,16 @@
   const unsigned ChainLen = Chain.size();
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                     << "\n");
-  const unsigned Sz = R.getVectorElementSize(Chain[0]);
+  Value *FirstStore = nullptr;
+  for (Value *V : Chain) {
+    assert(isa(V) && "Expected only StoreInst here!");
+    if (StoreInst *SI = cast(V))
+      if (SI->getValueOperand())
+        FirstStore = V;
+  }
+  if (!FirstStore)
+    return false;
+  const unsigned Sz = R.getVectorElementSize(FirstStore);
   const unsigned VF = VecRegSize / Sz;
 
   if (!isPowerOf2_32(Sz) || VF < 2)
@@ -4703,13 +4833,20 @@
                       << "\n");
     ArrayRef Operands = Chain.slice(i, VF);
 
+    // Skip if any store instruction was already vectorized.
+    if (std::any_of(Operands.begin(), Operands.end(),
+                    [](Value *V) {
+                      return (!(cast(V))->getValueOperand());
+                    }))
+      continue;
+
     R.buildTree(Operands);
     if (R.isTreeTinyAndNotFullyVectorizable())
      continue;
 
     R.computeMinimumValueSizes();
-    int Cost = R.getTreeCost();
+    int Cost = R.getTreeCost(true);
 
     LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
@@ -4728,6 +4865,7 @@
       // Move to the next bundle.
i += VF - 1; + R.reduceTree(); Changed = true; } } Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -39,19 +39,27 @@ ; CHECK-LABEL: @store_chain_v2i64( ; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1 ; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1 -; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1 ; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8 ; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8 ; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8 ; CHECK-NEXT: [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8 -; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[V0_0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[V1_0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[V0_1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> undef, i64 [[V1_1]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = sub <2 x i64> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i64> [[TMP7]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 8 ; CHECK-NEXT: ret void ; %a.0 = getelementptr i64, i64* %a, i64 0 Index: test/Transforms/SLPVectorizer/X86/addsub.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/addsub.ll +++ test/Transforms/SLPVectorizer/X86/addsub.ll @@ -348,22 +348,28 @@ define void @no_vec_shuff_reorder() #0 { ; CHECK-LABEL: @no_vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* 
getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]] -; CHECK-NEXT: store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]] -; CHECK-NEXT: store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]] -; CHECK-NEXT: store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fa to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fb to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> undef, float [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> undef, float [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = fsub <4 x float> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP21]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4 ; CHECK-NEXT: ret void ; %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 Index: test/Transforms/SLPVectorizer/X86/bad_types.ll 
=================================================================== --- test/Transforms/SLPVectorizer/X86/bad_types.ll +++ test/Transforms/SLPVectorizer/X86/bad_types.ll @@ -12,11 +12,12 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64 ; CHECK-NEXT: [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64 -; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42 -; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> undef, i64 [[A_CAST]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[B_CAST]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> , [[TMP1]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, i64* [[PTR:%.*]], i32 1 -; CHECK-NEXT: store i64 [[A_AND]], i64* [[PTR]] -; CHECK-NEXT: store i64 [[B_AND]], i64* [[GEP]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[PTR]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/crash_dequeue.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_dequeue.ll +++ test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -11,23 +11,25 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[_M_CUR2_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST]], i64 0, i32 1 ; CHECK-NEXT: [[_M_CUR2_I_I81:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[_M_CUR2_I_I81]], align 8 ; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST]], i64 0, i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = load double*, double** [[_M_FIRST3_I_I83]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double*> undef, double* [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double*> [[TMP3]], double* [[TMP2]], i32 1 ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]] ; CHECK: while.cond.i.preheader: +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double*> undef, double* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double*> [[TMP5]], double* [[TMP2]], i32 1 ; CHECK-NEXT: br label [[WHILE_COND_I:%.*]] ; CHECK: while.cond.i: ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_BODY_I:%.*]] ; CHECK: while.body.i: ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]] ; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: -; CHECK-NEXT: [[TMP3:%.*]] = phi double* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double* [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, 
[[WHILE_BODY_I]] ] -; CHECK-NEXT: store double* [[TMP4]], double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: store double* [[TMP3]], double** [[_M_FIRST3_I_I]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x double*> [ [[TMP4]], [[ENTRY:%.*]] ], [ [[TMP6]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double** [[_M_CUR2_I_I]] to <2 x double*>* +; CHECK-NEXT: store <2 x double*> [[TMP7]], <2 x double*>* [[TMP8]], align 8 ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] ; CHECK: if.then.i55: ; CHECK-NEXT: br label [[WHILE_COND]] Index: test/Transforms/SLPVectorizer/X86/crash_gep.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_gep.ll +++ test/Transforms/SLPVectorizer/X86/crash_gep.ll @@ -11,12 +11,13 @@ ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint i64* [[ADD_PTR]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 -; CHECK-NEXT: store i64 [[TMP1]], i64* [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i64* [[ARRAYIDX]] to i64 -; CHECK-NEXT: store i64 [[TMP2]], i64* [[ADD_PTR]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64*> undef, i64* [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/crash_lencod.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_lencod.ll +++ test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -126,14 +126,16 @@ define fastcc void @dct36(double* %inbuf) { ; CHECK-LABEL: @dct36( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 2 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF]], i64 1 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX44]], align 8 -; CHECK-NEXT: [[ADD46:%.*]] = fadd double [[TMP0]], undef -; CHECK-NEXT: store double [[ADD46]], double* [[ARRAYIDX41]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[INBUF]], align 8 -; CHECK-NEXT: [[ADD49:%.*]] = fadd double [[TMP1]], [[TMP0]] -; CHECK-NEXT: store double [[ADD49]], double* [[ARRAYIDX44]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double undef, i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: ret void 
; entry: Index: test/Transforms/SLPVectorizer/X86/cse.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cse.ll +++ test/Transforms/SLPVectorizer/X86/cse.ll @@ -19,19 +19,20 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> , [[TMP2]] ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 -; CHECK-NEXT: [[ADD8:%.*]] = fadd double [[TMP5]], 7.000000e+00 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[ADD12:%.*]] = fadd double [[MUL11]], 8.000000e+00 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[MUL11]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> , [[TMP9]] ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 -; CHECK-NEXT: store double [[ADD12]], double* [[ARRAYIDX13]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[G]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP10]], <4 x double>* [[TMP11]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal.ll +++ test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -730,54 +730,54 @@ ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 +; CHECK-NEXT: 
[[TMP1:%.*]] = or i64 [[TMP0]], 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 2 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP0]], 3 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 ; CHECK-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]] ; CHECK: for.body16.lr.ph: ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ADD_PTR]], align 4 ; CHECK-NEXT: br label [[FOR_BODY16:%.*]] ; CHECK: for.cond.cleanup15: -; CHECK-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ] -; CHECK-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4 -; CHECK-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4 -; CHECK-NEXT: store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY]] ], [ [[TMP19:%.*]], [[FOR_BODY16]] ] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6 ; CHECK-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; CHECK: for.body16: -; CHECK-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ] ; CHECK-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ] -; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000 -; CHECK-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000 +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP19]], [[FOR_BODY16]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 +; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP10]], 0x3FF19999A0000000 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1 +; CHECK-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[TMP11]], 0xBFF3333340000000 ; CHECK-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]] -; CHECK-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]] +; CHECK-NEXT: [[SUB19:%.*]] = fadd fast float [[SUB92]], [[TMP6]] ; 
CHECK-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000 -; CHECK-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000 -; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000 -; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000 -; CHECK-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000 -; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]] -; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]] -; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]] -; CHECK-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]] +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> , [[TMP9]] +; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float undef, undef +; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], undef +; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], undef +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP12]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP13]], [[MUL20]] +; CHECK-NEXT: [[SUB28:%.*]] = fadd fast float [[SUB2694]], [[MUL20]] ; CHECK-NEXT: [[INC]] = add nuw i32 [[J_098]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> undef, float [[SUB19]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP14]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP19]] = insertelement <4 x float> [[TMP17]], float [[TMP18]], i32 3 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]] ; ; STORE-LABEL: @foo( @@ -790,54 +790,54 @@ ; STORE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ] ; STORE-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2 ; STORE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]] -; STORE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; STORE-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1 -; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] -; STORE-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4 -; STORE-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2 -; STORE-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]] -; STORE-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4 -; STORE-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3 -; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]] -; STORE-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 +; STORE-NEXT: [[TMP1:%.*]] = or i64 [[TMP0]], 1 +; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]] +; STORE-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 2 +; STORE-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] +; STORE-NEXT: [[TMP3:%.*]] = or i64 [[TMP0]], 3 +; STORE-NEXT: 
[[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]] +; STORE-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; STORE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 ; STORE-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]] ; STORE: for.body16.lr.ph: ; STORE-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]] -; STORE-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4 +; STORE-NEXT: [[TMP6:%.*]] = load float, float* [[ADD_PTR]], align 4 ; STORE-NEXT: br label [[FOR_BODY16:%.*]] ; STORE: for.cond.cleanup15: -; STORE-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ] -; STORE-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4 -; STORE-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4 -; STORE-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4 -; STORE-NEXT: store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4 +; STORE-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY]] ], [ [[TMP19:%.*]], [[FOR_BODY16]] ] +; STORE-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; STORE-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; STORE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; STORE-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6 ; STORE-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; STORE: for.body16: -; STORE-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ] ; STORE-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ] -; STORE-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ] -; STORE-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000 -; STORE-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000 +; STORE-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP19]], [[FOR_BODY16]] ] +; STORE-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 +; STORE-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP10]], 0x3FF19999A0000000 +; STORE-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1 +; STORE-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[TMP11]], 0xBFF3333340000000 ; STORE-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]] -; STORE-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]] +; STORE-NEXT: [[SUB19:%.*]] = fadd fast float [[SUB92]], [[TMP6]] ; STORE-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000 -; STORE-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000 -; STORE-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000 -; STORE-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000 -; STORE-NEXT: 
[[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000 -; STORE-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]] -; STORE-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]] -; STORE-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]] -; STORE-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]] +; STORE-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> , [[TMP9]] +; STORE-NEXT: [[ADD2293:%.*]] = fadd fast float undef, undef +; STORE-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], undef +; STORE-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], undef +; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> undef, <4 x i32> +; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP12]], [[RDX_SHUF]] +; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> +; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; STORE-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; STORE-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP13]], [[MUL20]] +; STORE-NEXT: [[SUB28:%.*]] = fadd fast float [[SUB2694]], [[MUL20]] ; STORE-NEXT: [[INC]] = add nuw i32 [[J_098]], 1 ; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]] +; STORE-NEXT: [[TMP15:%.*]] = insertelement <4 x float> undef, float [[SUB19]], i32 0 +; STORE-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP10]], i32 1 +; STORE-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP14]], i32 2 +; STORE-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP9]], i32 2 +; STORE-NEXT: [[TMP19]] = insertelement <4 x float> [[TMP17]], float [[TMP18]], i32 3 ; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]] ; entry: Index: test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -15,32 +15,59 @@ ; Vector cost is 6, Scalar cost is 7 ; SSE: Adding cost -1 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction) define i32 @test_add(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_add( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = add i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = 
add <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_add( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = add i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_add( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = add i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] +; 
SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -136,32 +163,59 @@ ; } define i32 @test_and(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_and( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = and i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_and( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = and i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> 
[[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_and( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = and i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -197,32 +251,59 @@ ; } define i32 @test_or(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_or( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = or i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = 
or i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_or( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = or i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_or( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = or i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = 
shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -258,32 +339,59 @@ ; } define i32 @test_xor(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_xor( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_xor( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = xor i32 undef, 
[[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_xor( +; SSE-NEXT: entry: +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef +; SSE-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] +; SSE-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] +; SSE-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] +; SSE-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]] +; SSE-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; SSE-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; SSE-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] +; SSE-NEXT: ret i32 [[TMP2]] ; entry: %0 = load i32, i32* %p, align 4 @@ -312,25 +420,45 @@ } define i32 @PR37731(<4 x i32>* noalias nocapture dereferenceable(16) %self) unnamed_addr #0 { -; CHECK-LABEL: @PR37731( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = xor i32 undef, undef -; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; 
CHECK-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef -; CHECK-NEXT: ret i32 [[TMP9]] +; AVX-LABEL: @PR37731( +; AVX-NEXT: entry: +; AVX-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16 +; AVX-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], +; AVX-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] +; AVX-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], +; AVX-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], +; AVX-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], +; AVX-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] +; AVX-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 +; AVX-NEXT: [[TMP7:%.*]] = xor i32 undef, undef +; AVX-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; AVX-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef +; AVX-NEXT: ret i32 [[TMP9]] +; +; SSE-LABEL: @PR37731( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16 +; SSE-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], +; SSE-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]] +; SSE-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], +; SSE-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 +; SSE-NEXT: [[TMP7:%.*]] = xor i32 undef, undef +; SSE-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef +; SSE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]] +; SSE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; SSE-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef +; SSE-NEXT: ret i32 [[TMP9]] ; entry: %0 = load <4 x i32>, <4 x i32>* %self, align 16 Index: test/Transforms/SLPVectorizer/X86/simplebb.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/simplebb.ll +++ test/Transforms/SLPVectorizer/X86/simplebb.ll @@ -64,15 +64,17 @@ ; CHECK-LABEL: @test_volatile_load( ; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8 ; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8 -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]] ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 ; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 ; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8 -; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]] -; CHECK-NEXT: store double [[MUL]], double* [[C:%.*]], align 8 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1 -; CHECK-NEXT: store double 
[[MUL5]], double* [[ARRAYIDX5]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[I0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[I1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[I4]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; %i0 = load volatile double, double* %a, align 8 Index: test/Transforms/SLPVectorizer/X86/unreachable.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/unreachable.ll +++ test/Transforms/SLPVectorizer/X86/unreachable.ll @@ -21,19 +21,18 @@ ; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[T7]], align 4 ; CHECK-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 7 ; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[T9]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[T6]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T8]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T10]], i32 3 ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[T1_0:%.*]] = phi i32 [ [[T4]], [[BB1:%.*]] ], [ 2, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[T2_0:%.*]] = phi i32 [ [[T6]], [[BB1]] ], [ 2, [[ENTRY]] ] -; CHECK-NEXT: [[T3_0:%.*]] = phi i32 [ [[T8]], [[BB1]] ], [ 2, [[ENTRY]] ] -; CHECK-NEXT: [[T4_0:%.*]] = phi i32 [ [[T10]], [[BB1]] ], [ 2, [[ENTRY]] ] -; CHECK-NEXT: store i32 [[T1_0]], i32* [[X]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i32> [ [[TMP3]], [[BB1:%.*]] ], [ <i32 2, i32 2, i32 2, i32 2>, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1 -; CHECK-NEXT: store i32 [[T2_0]], i32* [[T12]], align 4 ; CHECK-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2 -; CHECK-NEXT: store i32 [[T3_0]], i32* [[T13]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3 -; CHECK-NEXT: store i32 [[T4_0]], i32* [[T14]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[X]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: